# commit-strip-dl/commit.py (master branch of yashmehrotra/commit-strip-dl on GitHub)

import requests
import urllib
import urlparse
import re
from lxml import html
import pdb
import os
import sys

# Archive page URL template: {0} = year, {1} = month, {2} = page number.
BASE_URL = 'http://www.commitstrip.com/en/{0}/{1}/page/{2}'

def fixurl(url):
    # turn string into unicode
    if not isinstance(url,unicode):
        url = url.decode('utf8')

    # parse it
    parsed = urlparse.urlsplit(url)

    # divide the netloc further
    userpass,at,hostport = parsed.netloc.rpartition('@')
    user,colon1,pass_ = userpass.partition(':')
    host,colon2,port = hostport.partition(':')

    # encode each component
    scheme = parsed.scheme.encode('utf8')
    user = urllib.quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = urllib.quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    host = host.encode('idna')
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    path = '/'.join(  # could be encoded slashes!
        urllib.quote(urllib.unquote(pce).encode('utf8'),'')
        for pce in parsed.path.split('/')
    )
    query = urllib.quote(urllib.unquote(parsed.query).encode('utf8'),'=&?/')
    fragment = urllib.quote(urllib.unquote(parsed.fragment).encode('utf8'))

    # put it back together
    netloc = ''.join((user,colon1,pass_,at,host,colon2,port))
    return urlparse.urlunsplit((scheme,netloc,path,query,fragment))

def create_dir(year, month):
    """Create and enter the "<Month name> <year>" directory, then download.

    The directory is created in the current working directory if it does not
    exist yet. The process then changes into it and calls ``download``.

    Args:
        year: Year, used in the directory name and passed on to ``download``.
        month: Month number 1-12 as an int or string; a zero-padded string
            such as ``'01'`` is accepted as well.

    Raises:
        KeyError: If *month* is not one of the twelve month numbers.
    """
    month_dict = {
        '1': 'January',
        '2': 'February',
        '3': 'March',
        '4': 'April',
        '5': 'May',
        '6': 'June',
        '7': 'July',
        '8': 'August',
        '9': 'September',
        '10': 'October',
        '11': 'November',
        '12': 'December',
    }

    # The table is keyed without leading zeros, so normalise '01' to '1' for
    # the lookup only; the caller's original value is still passed on below.
    month_full = month_dict[str(month).lstrip('0')]

    dir_name = "{0} {1}".format(month_full, year)

    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    # download() writes files under relative names, so enter the directory
    # before starting it.
    os.chdir(dir_name)

    download(year, month, month_full)

def download(year, month, month_full):

    for i in range(1,100):
        url = BASE_URL.format(year,month,i)
        r = requests.get(url)

        text = r.text
        tree = html.fromstring(text)
        #pdb.set_trace()

        try:

            month_list = tree.xpath('//ul[@id="collapsArch-{0}:3"]/li[@class="collapsing archives"]/a/@title'.format(year))
            count_list = tree.xpath('//ul[@id="collapsArch-{0}:3"]/li[@class="collapsing archives"]/a/span[@class="monthCount"]/text()'.format(year))
            for_an_error = tree.xpath('//ul[@id="collapsArch-{0}:3"]/li[@class="collapsing archives"]/a/span[@class="monthCount"]/text()'.format(year))[0]

            cur_month = month_list.index(month_full)
            cur_count = count_list[cur_month]

            cur_count = cur_count[1:-1]

            if tree.xpath('//div[@class="entry-content"]/p/img'):
                img_url = tree.xpath('//div[@class="entry-content"]/p/img/@src')[0]
            elif tree.xpath('//div[@class="entry-content"]/p/a/img/@src'):
                img_url = tree.xpath('//div[@class="entry-content"]/p/a/img/@src')[0]
            else:
                continue

            img_url = fixurl(img_url)
            title = tree.xpath( '//h1[@class="entry-title"]/a/text()')[0].encode('utf-8').decode('ascii','ignore')
            print 'Downloading ({1} of {2}) {0}'.format(title,i,cur_count)
            urllib.urlretrieve(img_url,title+'.jpg')

        except IndexError:

            print '{0} images downloaded'.format(i-1)
            print 'Enough for this month'
            break

        except Exception as e:

            print 'Looks like some problem came'
            print 'It is due to {0}'.format(e)
            break

if __name__ == "__main__":

    # Both positional arguments are required; exit with a usage hint instead
    # of an IndexError traceback when they are missing.
    if len(sys.argv) < 3:
        sys.exit('Usage: {0} <year> <month>'.format(sys.argv[0]))

    year = sys.argv[1]
    month = sys.argv[2]
    create_dir(year, month)