Commit b35cff53 authored by Dave Lane
separating the nodeJS and python functionality - this is the nodeJS-based toolset

parent 5b018675
# CouchDB actions
## Access via the Futon interface:
In a browser, use: https://couch.oerfoundation.org/_utils/
For a specific document, use something like: https://couch.oerfoundation.org/_utils/document.html?mentions/_design/ids
## Reading a view
To copy a view locally, pretty-printed so you can edit it:
`curl https://user:pass@couch.oerfoundation.org/mentions/_design/ids | python -m json.tool > ids.json`
## Replacing a downloaded view with an update
`curl -X PUT https://user:pass@couch.oerfoundation.org/mentions/_design/ids -d @ids.json`
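If the PUT comes back with a 409 conflict, the `_rev` in `ids.json` no longer matches the live design document (or was edited out). A quick way to check the current revision before retrying (same credentials assumed):
`curl -s https://user:pass@couch.oerfoundation.org/mentions/_design/ids | python -c 'import json,sys; print(json.load(sys.stdin)["_rev"])'`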
## Getting rid of [error] nonode@nohost database_does_not_exist re _users
To fix this, create the _users, _replicator, and _global_changes databases:
`curl -X PUT https://user:pass@couch.oerfoundation.org/_users`
`curl -X PUT https://user:pass@couch.oerfoundation.org/_replicator`
`curl -X PUT https://user:pass@couch.oerfoundation.org/_global_changes`
(see http://docs.couchdb.org/en/master/install/setup.html, http://guide.couchdb.org/draft/views.html)
## More complex queries (using Bash)
`curl 'https://bot:vUAQo58A8Fq9wq@couch.dev.oerfoundation.org/blog-feeds-hetzner/_design/ids/_view/by_site_and_wp_id?key=\[179,"66"\]'`
`curl 'https://bot:vUAQo58A8Fq9wq@couch.dev.oerfoundation.org/blog-feeds-hetzner/_design/ids/_view/by_site_id?key="66"'`
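Depending on the curl version, the backslash-escaped brackets are handled by curl's URL globber; URL-encoding the key (and optionally passing `-g` to switch globbing off) avoids relying on that, e.g. a sketch of the same query:
`curl -g 'https://bot:vUAQo58A8Fq9wq@couch.dev.oerfoundation.org/blog-feeds-hetzner/_design/ids/_view/by_site_and_wp_id?key=%5B179,%2266%22%5D'`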
#!/usr/bin/python
# Copyright 2012 Open Education Resource Foundation
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import time
from datetime import datetime
import couchdb
import feedparser
import sys
import json
import hashlib
import HTMLParser
# retrieve URL including authentication credentials from config JSON
options = json.load(open('./options.json', 'rt'))
couch = couchdb.Server(options['url'])
db = couch[options['db']]
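# HTML parser instance, used below to un-escape HTML entities in post bodies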
h = HTMLParser.HTMLParser()
tags = options['tags']
# serial number of messages retrieved this poll to uniquify ID
serial = 0
# get the timestamp of the last ask post in the database
view = db.view('ids/ask', descending=True, limit=1)
if len(view) == 1:
    for row in view:
        lasttime = row.key
else:
    lasttime = "2000-01-01T00:00:00.000Z"
def gravatar(e):
    return 'http://www.gravatar.com/avatar/' + \
        hashlib.md5(e.strip().lower()).hexdigest() + '?s=48&d=identicon&r=pg'
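# fetch the Atom feed for a tag, then each question's feed, and save any new
# questions, answers and comments as mention documents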
def process_tag(tag):
    def comment_or_answer(mo):
        if mo.group(1) == 'Comment':
            return 'Re:'
        return 'Answer:'
    global serial
    feed = 'http://ask.oeruniversity.org/feeds/atom/?tags=' + tag
    qfeed = 'http://ask.oeruniversity.org/feeds/questiona/%s/'
    qpattern = re.compile(r'http://ask.OERuniversity.org/question/(?P<q>\d+)')
    # find all of the questions
    rss = feedparser.parse(feed)
    qitems = rss['items']
    qitems.reverse()
    qs = []
    for qitem in qitems:
        mo = qpattern.match(qitem['link'])
        if mo:
            qs.append(mo.group('q'))
    # for each of the questions, find the new questions, answers, comments
    for q in qs:
        rss = feedparser.parse(qfeed % q)
        feedtitle = rss['channel']['title']
        items = rss['items']
        for item in items:
            if item['title'] == 'RSS Error' and item['description'] == 'Error reading RSS data':
                break
            truncated = False
            dt = datetime.strptime(item['date'], '%Y-%m-%dT%H:%M:%S+00:00')
            we_timestamp = dt.strftime('%Y-%m-%dT%H:%M:%S.000Z')
            if we_timestamp <= lasttime:
                continue
            seconds = time.mktime(dt.timetuple())
            # strip out HTML markup before abridging, so we don't stop midtag
            body = item['title'] + ' ' + item['summary']
            abridged = re.sub(r'<[^>]*>', '', body)
            abridged = h.unescape(abridged)
            # remove square brackets (link anchors)
            abridged = re.sub(r'\[|]', ' ', abridged)
            abridged = re.sub(r'\s+', ' ', abridged)
            # remove inline attribution, already have author
            abridged = re.sub(r'(Comment|Answer) by (.*?) for',
                              comment_or_answer, abridged, 1)
            abridged = abridged[:500].strip()
            abridged = abridged.replace('&nbsp;', ' ')
            abridged = abridged.replace('\n', ' ')
            i = len(abridged)
            if i > 137:
                i = 137
                while abridged[i] != ' ' and i > 0:
                    i -= 1
                abridged = abridged[:i] + '...'
                truncated = True
            author = item['author_detail']['name']
            mention = {
                'from_user': author,
                'from_user_name': author,
                'created_at': item['date'],
                'profile_image_url':
                    gravatar(item['author_detail']['email']),
                'text': abridged,
                'truncated': truncated,
                'id': '%d%05d%03d' % (seconds, int(q), serial),
                'profile_url': item['author_detail']['href'],
                'we_source': 'ask',
                'we_feed': '%s: %d' % (feedtitle, int(q)),
                'we_tags': [tag],
                'we_timestamp': we_timestamp,
                'we_link': item['link']
            }
            if tag == 'sp4edu':
                mention['we_tags'] = ['sp4ed']
            print mention
            print '==========='
            db.save(mention)
            serial += 1
for tag in tags:
    process_tag(tag)
#!/usr/bin/python
# Copyright 2018 Open Education Resource Foundation
# developed by Dave Lane dave@oerfoundation.org, with help from
# code written by Jim Tittsler
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import time
import calendar
from datetime import datetime
import cookielib
import urllib, urllib2
import couchdb
import feedparser
import lxml.html
import xml.sax
import json
# for debugging
import logging
import pprint
# to fix various unicode issues
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# retrieve URL including authentication credentials from config JSON
options = json.load(open('../config/options.json', 'rt'))
# version info
scanner = 'WEnotes Blog Feed Scanner'
scanner_version = '0.3'
# logging configuration
LogLevel = logging.DEBUG # or logging.INFO or logging.WARN, etc.
#LogLevel = logging.INFO # or logging.INFO or logging.WARN, etc.
LogFilename = options['logdir'] + '/blog_feeds.log'
LogFormat = '%(asctime)s - %(levelname)s: %(message)s'
print 'logfile %s, level %s' % (LogFilename, LogLevel)
logging.basicConfig(format=LogFormat,level=LogLevel,filename=LogFilename)
# configure the Couch db for mentions
couch = couchdb.Server(options['url'])
dbmentions = couch[options['db']]
# configure the Couch db for finding the feed details
couch2 = couchdb.Server(options['localcouch'])
dbfeeds = couch2[options['dbfeeds']]
# Set up the prettyprinter object for debugging
pp = pprint.PrettyPrinter(indent=4)
# return a tuple (text, truncated) of abridged text and truncated flag
def abridge(text):
    truncated = False
    # strip out HTML comments (and MSOffice conditionals)
    abridged = re.sub(r'<!--.*?-->', '', text)
    # strip out HTML markup before abridging,
    # so we don't stop midtag
    abridged = re.sub(r'<[^>]*>', ' ', abridged)
    abridged = re.sub(r'\s*by [^.]+\.\n?', '', abridged)
    abridged = abridged[:500].strip()
    abridged = abridged.replace('&nbsp;', ' ')
    abridged = abridged.replace('&#8211;', "--")
    abridged = abridged.replace('&#8216;', "'")
    abridged = abridged.replace('&#8217;', "'")
    abridged = abridged.replace('&#8220;', '"')
    abridged = abridged.replace('&#8221;', '"')
    abridged = abridged.replace('&#8230;', "...")
    abridged = abridged.replace('&#38;', "&")
    abridged = abridged.replace('\n', ' ')
    # get rid of multiple spaces (which the above may have introduced)
    abridged = re.sub(r' +', ' ', abridged)
    i = len(abridged)
    if i > 200:
        i = 200
        while abridged[i] != ' ' and i > 0:
            i -= 1
        abridged = abridged[:i] + '...'
        truncated = True
    return (abridged, truncated)
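# process a single feed document: parse its RSS/Atom URL, build mention docs
# for new items that match the configured tags, save them, and return the
# feed's updated timestamp (or None if there is nothing new)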
def process_feed(feed):
    new_docs = []
    doc = feed.doc
    logging.debug("== %s", doc['feed_url'])
    try:
        feed_last = doc['last_checked']
    except KeyError:
        feed_last = None
    type = doc['feed_type']
    url = doc['feed_url']
    if type == 'application/rss+xml':
        logging.debug('processing RSS feed %s', url)
        content = feedparser.parse(url)
    elif type == 'application/atom+xml':
        logging.debug('processing Atom feed %s', url)
        content = feedparser.parse(url)
    else:
        logging.debug('unsupported feed type %s ', type)
    # print the feed...
    logging.debug("------ the feed ------ %s", pp.pformat(feed))
    # if there is no timestamp, probably no entries
    try:
        if not content.feed.has_key('updated_parsed') or content.feed['updated_parsed'] is None:
            return None
    except UnboundLocalError:
        logging.debug('content object not defined')
        return None
    feed_timestamp = time.strftime('%Y-%m-%dT%H:%M:%S.000Z', content.feed['updated_parsed'])
    # check last_updated of feed table
    if feed_timestamp <= feed_last:
        return None  # no need to even bother checking items
    # check last timestamp in the mentions database for this feed
    items = content['items']
    items.reverse()
    for item in items:
        # FIXME perhaps should query to see if article exists
        # to avoid "updates" that change pub time
        we_timestamp = time.strftime('%Y-%m-%dT%H:%M:%S.000Z', item['updated_parsed'])
        if we_timestamp > feed_last:
            truncated = False
            text = ''
            if len(item['title'].strip()) > 0:
                text = item['title'].strip() + ': '
            text = text + item['summary']
            (abridged, truncated) = abridge(text)
            mention = {
                'from_user': doc['username'],
                'from_user_name': doc['display_name'],
                'created_at': item['updated'],
                'profile_url': item['link'],
                #'profile_image_url': doc['profile_image_url'],
                'title': item['title'].strip(),
                'text': abridged,
                'truncated': truncated,
                'id': calendar.timegm(item['updated_parsed']),
                'we_source': 'feed',
                'we_identifier': 'blog_feed',
                'we_scanner': scanner,
                'we_scanner_version': scanner_version,
                'we_feed': content.feed['title'],
                'we_feed_url': doc['feed_url'],
                'we_timestamp': we_timestamp,
                'we_link': item['link']
            }
            # if there is an id, use it instead of our made up one
            if item.has_key('id'):
                mention['id'] = item['id']
            # if there is width and/or height, copy them
            if doc.has_key('profile_image_width'):
                mention['profile_image_width'] = doc['profile_image_width']
            if doc.has_key('profile_image_height'):
                mention['profile_image_height'] = doc['profile_image_height']
            # if there is a gravatar hash, copy it
            if doc.has_key('gravatar'):
                mention['gravatar'] = doc['gravatar']
            # if tags is empty, we take everything and apply we_tags
            if len(doc['tags']) == 0:
                mention['we_tags'] = doc['tags']
                new_docs.append(mention)
            else:
                mention['we_tags'] = []
                # only save things tagged with tags
                # or that mention the tag in the title
                keep = False
                for we_tag in doc['tags']:
                    if item.has_key('tags'):
                        for tag in item['tags']:
                            logging.debug("==tag: %s we_tag: %s", tag, we_tag)
                            if tag.has_key('term') and tag['term'].lower().find(we_tag) > -1 and we_tag not in mention['we_tags']:
                                mention['we_tags'].append(we_tag)
                                continue
                    if text.lower().find(we_tag) > -1 and we_tag not in mention['we_tags']:
                        logging.debug("==we_tag: %s in text search", we_tag)
                        mention['we_tags'].append(we_tag)
                # keep it if there was one or more interesting tags
                if len(mention['we_tags']) > 0:
                    # see if we already have this one
                    # only update if tags have changed
                    existing = dbmentions.view('ids/feed')
                    exists = existing[mention['we_link']]
                    if len(exists) > 0:
                        # ideally there should be at most one...
                        for exi in exists:
                            if set(mention['we_tags']) != set(exi['value']):
                                logging.debug("++ need to update tags, old>new %s %s", exi['value'], mention['we_tags'])
                                odoc = dbmentions[exi['id']]
                                odoc['we_tags'] = mention['we_tags']
                                dbmentions[exi['id']] = odoc
                                logging.debug("++updated tags in post")
                            else:
                                logging.debug("--old post already has all the tags")
                    else:
                        logging.debug("??don't seem to have this post: %s", mention['we_tags'])
                        new_docs.append(mention)
                else:
                    logging.debug("!!!Skipping post with no interesting tags: %s", doc['feed_url'])
                    #print "!!!", item
    #import pdb; pdb.set_trace()
    if len(new_docs) > 0:
        logging.info("**** updating %d new docs", len(new_docs))
        result = dbmentions.update(new_docs)
        logging.debug(" %s", result)
    return time.strftime('%Y-%m-%dT%H:%M:%S.000Z', content.feed['updated_parsed'])
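# main loop: check every feed registered in the feeds database and record
# when each was last successfully checked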
for feed in dbfeeds.view('_design/ids/_view/by_wp_id', include_docs=True):
    try:
        logging.debug('checking feed %s for references to %s', feed.doc['feed_url'], feed.doc['tags'])
        try:
            last_checked = process_feed(feed)
        except xml.sax._exceptions.SAXException:
            last_checked = None
        except TypeError:
            logging.debug('Exception hit - typeError %s', feed.doc)
            last_checked = None
        if last_checked:
            logging.debug('doc _rev: %s', feed.doc['_rev'])
            doc = feed.doc
            doc['last_updated'] = last_checked
            doc['last_successful'] = time.strftime(
                '%Y-%m-%dT%H:%M:%S.000Z')
            try:
                dbfeeds[feed.id] = doc
            except couchdb.http.ResourceConflict:
                logging.debug('Exception hit - resourceConflict %s', doc['_id'])
    except IndexError:
        logging.debug('issue accessing relevant data on doc id %s', feed.doc['_id'])
logging.info("run finished\n")
#!/usr/bin/python
""" Harvest bookmarks with tags specified in options file."""
# Copyright 2017 Open Education Resource Foundation
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import time
from datetime import datetime, timedelta
import couchdb
import urllib
import json
import feedparser
import argparse
import requests
import HTMLParser
import hashlib
# for debugging
import logging
import pprint
# to fix various unicode issues
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# version info
scanner = 'WEnotes Semantic Scuttle (Bookmarks) Scanner'
scanner_version = '0.4.1'
DEBUG = False
#DEBUG = True
DELAY = 1.0 # delay in seconds between Discourse hits
MAX_TEXT_LEN = 300 # max characters before truncation
# retrieve URL from config JSON
options = json.load(open('../config/options.json', 'r'))
couch = couchdb.Server(options['url'])
db = couch[options['db']]
# get tag list from URL
tagurl = options['settings-url']
jsoncontent = urllib.urlopen(tagurl)
reference_tags = json.loads(jsoncontent.read())["tags"]
# set header this app will report itself as
headers = {'User-Agent' : 'WEnotes-Bookmarks/0.1'}
# the URL of the bookmarks RSS feed
bookmarks_url = options["bookmarks"]["url"]
bookmarks_rss_url = bookmarks_url + '/rss.php/all'
# for parsing HTML in bookmark text
h = HTMLParser.HTMLParser()
# length of content for messages
message_length = 200
#logging configuration
LogLevel = logging.DEBUG # or logging.INFO or logging.WARN, etc.
#LogLevel = logging.INFO # or logging.INFO or logging.WARN, etc.
LogFilename = options['logdir'] + '/bookmarks.log'
LogFormat = '%(asctime)s - %(levelname)s: %(message)s'
print 'logfile %s, level %s' % (LogFilename, LogLevel)
logging.basicConfig(format=LogFormat,level=LogLevel,filename=LogFilename)
# initialising
lasttime = "2000-01-01T00:00:00.000Z"
# setting up the parser for RSS
parser = argparse.ArgumentParser(description='Harvest posts from Bookmarks, our Semantic Scuttle instance.')
parser.add_argument('-f', '--full', action='store_false',
help='get list of categories, and then every topic in each')
args = parser.parse_args([])
# Set up the prettyprinter object for debugging
pp = pprint.PrettyPrinter(indent=4)
# get all the mention ids currently in the db so we can
# ensure we don't duplicate
all_mentions = db.view('ids/bookmarks')
logging.debug('avoiding these mentions we already have: %s', pp.pformat(all_mentions))
# check if we have this mention already
def have_mention(msg_id):
    """Return boolean showing if we already have this message."""
    #print 'id = %s' % msg_id
    for mention in all_mentions:
        if msg_id == mention['value']:
            #print 'Found id %s' % msg_id
            return True
    logging.debug('failed to find %s', msg_id)
    return False
# deal with the +0000 time offset, not supported by datetime
# see https://stackoverflow.com/questions/23940551/why-z-is-not-supported-by-pythons-strptime
def dt_parse(t):
    ret = datetime.strptime(t[0:25], '%a, %d %b %Y %H:%M:%S')
    if t[26] == '+':
        ret += timedelta(hours=int(t[27:29]))
    elif t[26] == '-':
        ret -= timedelta(hours=int(t[27:29]))
    return ret
# strip any leading "#" from any tags
def striphashes(a):
    # rebinding the loop variable would not modify the list in place,
    # so build and return a new list instead
    return [i[1:] if i.startswith('#') else i for i in a]
# find all of the bookmarks
rss = feedparser.parse(bookmarks_rss_url)
# find the channel title
feedtitle = rss['channel']['title']
items = rss['items']
# reverse them, so oldest is first
items.reverse()
logging.debug("found %d items", len(items))
# for each item in RSS check if it has one (or more) of our tags
for item in items:
    # is this an error item? If so, bail
    if item['title'] == 'RSS Error' and item['description'] == 'Error reading RSS data':
        break
    logging.debug("looking at bookmark: %s", item['title'])
    # is this an item with a relevant tag...
    try:
        taglist = [t['term'] for t in item['tags']]
        # strip any leading "#" from tags
        taglist = striphashes(taglist)
    except:
        logging.debug("no tags defined for %s", item['title'])
        continue
    common_tags = list(set(taglist) & set(reference_tags))
    #logging.debug("common tags: %s", common_tags)
    #logging.debug("taglist: %s\nreference tags: %s\ncommon tags: %s", taglist, reference_tags, common_tags)
    logging.debug("taglist: %s\ncommon tags: %s", taglist, common_tags)
    if not common_tags:
        logging.debug("no common tags, not interested in this one")
        continue
    # initialise
    truncated = False
    dt = dt_parse(item['published'])
    we_timestamp = dt.strftime('%Y-%m-%dT%H:%M:%S.000Z')
    if we_timestamp <= lasttime:
        logging.debug('the timestamp %s is before our reference time of %s... bailing.', we_timestamp, lasttime)
        continue
    seconds = time.mktime(dt.timetuple())
    # check if we've seen the gid before...
    if have_mention(item['id']):
        continue
    # strip out HTML markup before abridging, so we don't stop midtag
    body = item['title'] + ' ' + item['summary']
    # pull out all the html tags
    abridged = re.sub(r'<[^>]*>', '', body)
    # remove any escaped tags
    abridged = h.unescape(abridged)
    # remove square brackets (link anchors)
    abridged = re.sub(r'\[|]', ' ', abridged)
    # remove multiple spaces
    abridged = re.sub(r'\s+', ' ', abridged)
    # remove line feeds and non-breaking spaces
    abridged = abridged.replace('&nbsp;', ' ')