#!/usr/bin/env python2
"""oerupartners.

Scrape OERu.org partner list to keep WikiEducator descriptions
and logos up to date.
"""

import os.path
import re
import time
import math
import ConfigParser
import hashlib
import logging
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

import requests
import mwclient
from lxml import html, etree
from PIL import Image

# Base URL of the OERu site and the page that lists its partners.
OERu = 'http://OERu.org'
OERu_partner_page = OERu + '/oeru-partners/'

# WikiEducator credentials are read from the [login] section of
# .wikieducator.rc, looked up in the current directory and then in the
# user's home directory.
config = ConfigParser.ConfigParser()
config.read(['.wikieducator.rc', os.path.expanduser('~/.wikieducator.rc')])
we_user = config.get('login', 'user')
we_pass = config.get('login', 'password')

# Logged-in WikiEducator session that is passed to the functions below.
we = mwclient.Site('wikieducator.org', path='/')
we.login(username=we_user, password=we_pass)

# All progress messages go to a log file rather than to the console.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename='/tmp/oerupartners.log')

def get_partners(u):
    """Scrape the partner listing page and describe each partner found.

    u -- URL of the partner listing page.

    Returns a list of dicts, one for each partner anchor inside the
    element with id "isotope-partners" (entries whose class contains
    "partners-donor" are excluded).  Each dict has the keys:

      "name" -- the alt text of the partner's logo, stripped
      "url"  -- absolute URL of the partner's own page
      "img"  -- absolute URL of the partner's logo image

    Raises requests.HTTPError if the listing page cannot be fetched.
    """
    page = requests.get(u)
    # fail loudly instead of parsing an error page into an empty list
    page.raise_for_status()
    tree = html.fromstring(page.text)
    partner_nodes = tree.xpath('//*[@id="isotope-partners"]/div[not(contains(@class,"partners-donor"))]/div/a')
    partners = []
    for node in partner_nodes:
        imgs = node.xpath('img')
        if not imgs:
            # an anchor without a logo has neither a name nor an image
            continue
        img = imgs[0]
        partners.append({
            "name": img.xpath('string(@alt)').strip(),
            # resolve against the page URL so root-relative, page-relative
            # and absolute links all yield a usable absolute URL
            "url": urljoin(u, node.xpath('string(@href)')),
            "img": urljoin(u, img.xpath('string(@src)')),
            })
    return partners

def get_image_files(we, partners):
    """Make sure every partner logo exists on the wiki and record its details.

    we       -- logged-in mwclient Site to search and upload to
    partners -- list of partner dicts carrying at least "name" and "img"

    For each partner the logo is downloaded and looked up on the wiki by
    its SHA-1.  If no image with that hash exists, the logo is uploaded
    under a name derived from the partner name and the file page is given
    a description/licence text.  Each dict gains the keys:

      "file"   -- title of the wiki file holding the logo
      "width"  -- logo width in pixels
      "height" -- logo height in pixels

    Returns the same list that was passed in.
    Raises requests.HTTPError if a logo cannot be downloaded.
    """
    for partner in partners:
        # fetch the image
        logging.info(partner['img'])
        img = requests.get(partner['img'])
        # never hash or upload the body of an HTTP error response
        img.raise_for_status()
        sha1 = hashlib.sha1(img.content).hexdigest()

        # see if WE already has this image
        logging.info(" SHA1: %s", sha1)
        existing = None
        for i in we.allimages(sha1=sha1, generator=True):
            # when several files share the hash the last one listed wins
            existing = i.page_title
            logging.info(" Exists: %s", i.page_title.encode('utf-8'))

        if existing is not None:
            partner['file'] = existing
        else:
            # upload the image
            pname = re.sub(r'[^- _a-z0-9]', '', partner['name'], flags=re.IGNORECASE)
            # Take the real extension from the URL path: slicing the last
            # four characters breaks on ".jpeg" and on URLs that carry a
            # query string or fragment.
            img_path = partner['img'].split('?', 1)[0].split('#', 1)[0]
            filename = pname + os.path.splitext(img_path)[1]
            logging.info(" Upload: %s", filename)
            try:
                we.upload(file=StringIO(img.content),
                          filename=filename,
                          ignore=True,
                          description='OERu partner %s logo' % (partner['name']))
            except mwclient.errors.FileExists:
                # reported through the log like every other message here
                logging.warning("FILE %s EXISTS!", filename)
            partner['file'] = filename
            page = we.Pages['File:' + filename]
            description = """
== Summary ==
{{Information
|Description=%s logo. Institutional logo used under fair dealing for OERu membership recognition on WikiEducator.
|Source=http://OERu.org/
|Author=%s
|Date=%s
|Permission=
|other_versions=
}}

== Licensing ==
{{Fair-use}}""" % (partner['name'], partner['name'], time.strftime('%Y-%m-%d'))
            page.save(text=description, summary=u'update description/license information',
                      minor=True)

        # record the pixel size of the logo
        im = Image.open(StringIO(img.content))
        partner['width'], partner['height'] = im.size
    return partners

def _html_to_wikitext(body):
    """Very quick and very dirty conversion of an HTML fragment to wikitext.

    Links become [url text], <em> becomes '', block tags and line breaks
    become newlines, list items become '*', a handful of numeric character
    references are replaced by plain characters, and closing tags are
    dropped.  The substitutions are order dependent.
    """
    body = body.replace('</a>', ']')
    body = re.sub(r'<a.*?href="([^"]*)"[^>]*>', r"[\1 ", body)
    body = re.sub(r'<((div[^>]*)|(p[^>]*)|(br/?))>', "\n", body)
    body = re.sub(r'</?em>', "''", body)
    body = re.sub(r'</[^>]*>', "", body)
    # NOTE(review): this repeats part of the block-tag pass above; kept so
    # the output stays exactly what the original sequence produced
    body = re.sub('<((p[^>]*)|(br/?))>', "\n", body)
    body = body.replace('&#8211;', "-")
    body = body.replace('&#8217;', "'")
    body = re.sub(r'&#822[01];', '"', body)
    body = body.replace('&#160;', ' ')
    body = re.sub(r'<((span)|(ul)|(ol))[^>]*>', '', body)
    body = body.replace('<li>', '*')
    return body


def make_partner_pages(we, partners):
    """Create or update one wiki page per partner from its OERu page.

    we       -- logged-in mwclient Site to save the pages to
    partners -- list of partner dicts carrying "name", "url" and "file"

    For each partner the page at partner["url"] is fetched; the first
    <h3> text is used as the display name, the first <h5> text as the
    tagline, and the main content element is converted to wikitext.  The
    result is saved as 'OERu/Home/Partners/<name>'.

    A partner whose page has no main content element is skipped with a
    warning instead of aborting the run.  Returns None.
    """
    for partner in partners:
        logging.info(partner['name'])
        # fetch the page
        partner_page = requests.get(partner['url'])
        tree = html.fromstring(partner_page.text)

        printname = tree.xpath('//h3/text()')
        printname = printname[0].strip() if printname else ''
        tagline = tree.xpath('//h5/text()')
        tagline = tagline[0] if tagline else ''
        body = tree.xpath('//*[@id="main"]/div/div[2]/div')
        if not body:
            # without the content element there is nothing to copy; leave
            # whatever is on the wiki alone
            logging.warning(" No body found at %s; page not updated",
                            partner['url'])
            continue
        body = _html_to_wikitext(etree.tostring(body[0]))
        pagebody = '''
<div style="width:640px; float:left;">
{{OERu_Partner_Header|name=%s}}
<onlyinclude>{{OERu_Partner_Summary|text=%s}}</onlyinclude>

{{OERu_Partner_Body|text=%s}}
</div>
{{OERu_Partner_Logo|img=File:%s|alt=%s|link=%s}}

[[Category:OERu Partner]]
''' % (printname, tagline, body, partner['file'], printname, partner['url'])
        page = we.Pages['OERu/Home/Partners/%s' % (partner['name'])]
        page.save(text=pagebody, summary=u'copy partner page from OERu.org')

def featured_partner(we, partners):
    """Write the rotating 'Template:OERu Featured Partner' page.

    we       -- logged-in mwclient Site to save the template to
    partners -- list of partner dicts carrying "name", "url", "file"
                and "width"

    The template is a {{#switch:}} keyed on
    floor(({{#time:U}} mod 345600) / period), i.e. a 345600-second
    (four-day) cycle cut into one equal slot per partner, with partners
    ordered by name.  Logos are shown at most 200px wide.

    With no partners nothing is saved and a warning is logged.
    Returns None.
    """
    numparts = len(partners)
    if numparts == 0:
        # avoid dividing by zero and do not blank the existing template
        logging.warning('no partners; featured partner template not updated')
        return

    # Round the slot length UP using true division.  Integer division
    # would floor it first, and then the last seconds of the cycle would
    # compute an index equal to numparts, which has no #switch case.
    period = int(math.ceil(345600.0 / numparts))

    body = [
        '<noinclude>This template is created by the [https://bitbucket.org/wikieducator/oerupartners/ oerupartners.py script], DO NOT HAND EDIT.',
        '',
        '</noinclude>{{Lozenge|style=background:#C3A8C9;|text=Featured OERu Partner}}',
        '',
        '',
        '{{#switch:{{#expr: floor(({{#time:U}} mod 345600)/%d)}}' % period,
    ]
    for i, partner in enumerate(sorted(partners, key=lambda k: k['name'])):
        body.append(' | %d = [[File:%s|left|%dpx|link=%s]] {{OERu Partner Feature|%s|%s}}' %
                    (i, partner['file'], min(200, partner['width']),
                     partner['url'], partner['name'], partner['url']))
    body.append('}}')
    page = we.Pages['Template:OERu Featured Partner']
    page.save(text="\n".join(body), summary=u'update featured partner display')

# Run the whole update:
#  1. get the list of partner dicts from the OERu partner page
#  2. add the wiki image filename to each dict (uploading new images)
#  3. update/make a partner page for each
#  4. make the featured partner template
partners = get_partners(OERu_partner_page)
partners = get_image_files(we, partners)

make_partner_pages(we, partners)
featured_partner(we, partners)