We can use CouchDB views to count Twitter hashtags per day. I've used two views. The first view uses a mapper to map hashtags to a [YEAR, MONTH, DAY] tuple. The view can subsequently be queried for the hashtags used on a given date.
import couchdb
from couchdb.design import ViewDefinition


def time_hashtag_mapper(doc):
    """Map every hashtag of a tweet to the day the tweet was created.

    Emits one ``([year, month, day], hashtag_text_lowercased)`` row per
    entry in ``doc['entities']['hashtags']``.  Documents without hashtags
    emit nothing.  A missing or falsy ``created_at`` is treated as
    timestamp 0.
    """
    # Imported inside the function so the mapper is self-contained --
    # presumably because only this function's source is sent to the
    # CouchDB Python view server.
    from datetime import datetime

    timestamp = doc.get('created_at') or 0

    entities = doc.get('entities')
    hashtags = entities.get('hashtags') if entities else None
    if not hashtags:
        return

    # NOTE(review): fromtimestamp() converts to the view server's *local*
    # time and utctimetuple() on a naive datetime performs no conversion,
    # so the day bucket follows the server's timezone -- confirm that is
    # intended.
    day = datetime.fromtimestamp(timestamp).utctimetuple()
    for tag in hashtags:
        yield ([day.tm_year, day.tm_mon, day.tm_mday], tag['text'].lower())


# Register the mapper as view "time_hashtags" in design document "index".
# `db` is expected to be an already-opened couchdb database handle.
view = ViewDefinition('index', 'time_hashtags', time_hashtag_mapper,
                      language='python')
view.sync(db)
The second view maps each tweet to a [YEAR, MONTH, DAY, HASHTAG] tuple. Then a reducer is used to count the tweets matching the tuple.
import couchdb
from couchdb.design import ViewDefinition


def date_hashtag_mapper(doc):
    """Map a tweet to one row per hashtag, keyed by creation day + hashtag.

    Emits ``([year, month, day, hashtag_text_lowercased], doc['_id'])`` for
    every entry in ``doc['entities']['hashtags']``.  Documents without
    hashtags emit nothing.  A missing or falsy ``created_at`` is treated as
    timestamp 0.
    """
    # Imported inside the function so the mapper is self-contained --
    # presumably because only this function's source is sent to the
    # CouchDB Python view server.
    from datetime import datetime

    entities = doc.get('entities')
    hashtags = entities.get('hashtags') if entities else None
    if not hashtags:
        # Nothing to emit, so skip the timestamp conversion entirely.
        return

    timestamp = doc.get('created_at') or 0
    # NOTE(review): fromtimestamp() converts to the view server's *local*
    # time and utctimetuple() on a naive datetime performs no conversion,
    # so the day bucket follows the server's timezone -- confirm that is
    # intended.
    day = datetime.fromtimestamp(timestamp).utctimetuple()
    for hashtag in hashtags:
        key = [day.tm_year, day.tm_mon, day.tm_mday, hashtag['text'].lower()]
        yield (key, doc['_id'])


def sumreducer(keys, values, rereduce):
    """Count the rows emitted for a key.

    On the first pass (``rereduce`` false) ``values`` holds the values
    emitted by the mapper, so their number is the count.  On a rereduce
    pass ``values`` holds counts returned by earlier calls, which are
    summed.
    """
    if rereduce:
        return sum(values)
    return len(values)


# Register mapper + reducer as view "daily_tagcount" in design document
# "index".  `db` is expected to be an already-opened couchdb database handle.
view = ViewDefinition('index', 'daily_tagcount', date_hashtag_mapper,
                      reduce_fun=sumreducer, language='python')
view.sync(db)
Finally, query the first view to find tags for the day and then query the second view for tweet counts per tag for the day.
import sys
import couchdb
import time
from datetime import date, datetime

# Command line: <database name> <date as YYYY-MM-DD>
if len(sys.argv) < 3:
    sys.exit("usage: %s <dbname> <YYYY-MM-DD>" % sys.argv[0])

server = couchdb.Server('http://localhost:5984')
dbname = sys.argv[1]
db = server[dbname]
_date = sys.argv[2]
dt = datetime.strptime(_date, "%Y-%m-%d").utctimetuple()

# get tags for this time interval (de-duplicated and sorted)
_key = [dt.tm_year, dt.tm_mon, dt.tm_mday]
tags = sorted(set(row.value
                  for row in db.view('index/time_hashtags', key=_key)))
# Single-argument print(...) produces the same output on Python 2 and 3.
print("Tags today %d" % len(tags))
print("")

# get count for date and hashtag
for tag in tags:
    _key = [dt.tm_year, dt.tm_mon, dt.tm_mday, tag]
    tag_count = [row.value
                 for row in db.view('index/daily_tagcount', key=_key)]
    # The reduced view is expected to return one row for the key; it can
    # return none (e.g. the two views disagree), so fall back to 0 instead
    # of raising IndexError.
    count = tag_count[0] if tag_count else 0
    print("Found %d %s on %s-%s-%s " % (count, tag, _key[0], _key[1], _key[2]))
This code will evolve over time.
Find the complete codebase on GitHub at: https://github.com/telvis07/twitter_mining. The develop branch has the latest stuff.