We can use CouchDB views to count Twitter hashtags per day. I've used two views. The first view uses a mapper to map hashtags to a [YEAR, MONTH, DAY] tuple. The view can subsequently be queried for the hashtags on that date.
import couchdb
from couchdb.design import ViewDefinition
def time_hashtag_mapper(doc):
    """Map each hashtag of a tweet to the UTC day the tweet was created.

    Yields one ``([year, month, day], hashtag)`` pair per entry in
    ``doc['entities']['hashtags']``, with the hashtag text lower-cased.
    Documents without any hashtags yield nothing.

    ``created_at`` is handed directly to ``gmtime``, so it is assumed to be
    a POSIX timestamp in seconds (TODO confirm against the code that loads
    the tweets). A missing or falsy value falls back to 0, i.e. Jan 1 1970.
    """
    # Kept as a function-local import, like the original.
    from time import gmtime

    entities = doc.get('entities')
    if not (entities and entities.get('hashtags')):
        return

    # gmtime() always converts in UTC. datetime.fromtimestamp() would use
    # the host's local zone, which puts tweets posted near midnight UTC on
    # the wrong day whenever the host is not running in UTC.
    dt = gmtime(doc.get('created_at') or 0)
    for hashtag in entities['hashtags']:
        yield ([dt.tm_year, dt.tm_mon, dt.tm_mday],
               hashtag['text'].lower())
# Define the 'time_hashtags' view in the 'index' design document, backed by
# the Python mapper above, and push it to the database.
# NOTE(review): `db` is not defined in this snippet; it is expected to be an
# already-opened couchdb database object.
view = ViewDefinition(
    'index',
    'time_hashtags',
    time_hashtag_mapper,
    language='python',
)
view.sync(db)
The second view maps each tweet to a tuple containing the [YEAR, MONTH, DAY, HASHTAG]. Then a reducer is used to count the tweets matching the tuple.
import couchdb
from couchdb.design import ViewDefinition
def date_hashtag_mapper(doc):
    """Map a tweet to one row per hashtag, keyed by UTC day plus hashtag.

    Yields one ``([year, month, day, hashtag], doc['_id'])`` pair per entry
    in ``doc['entities']['hashtags']``, with the hashtag text lower-cased.
    Documents without any hashtags yield nothing.

    ``created_at`` is handed directly to ``gmtime``, so it is assumed to be
    a POSIX timestamp in seconds (TODO confirm against the code that loads
    the tweets). A missing or falsy value falls back to 0, i.e. Jan 1 1970.
    """
    # Kept as a function-local import, like the original.
    from time import gmtime

    entities = doc.get('entities')
    if not (entities and entities.get('hashtags')):
        return

    # gmtime() always converts in UTC. datetime.fromtimestamp() would use
    # the host's local zone, which puts tweets posted near midnight UTC on
    # the wrong day whenever the host is not running in UTC.
    dt = gmtime(doc.get('created_at') or 0)
    for hashtag in entities['hashtags']:
        yield ([dt.tm_year, dt.tm_mon, dt.tm_mday,
                hashtag['text'].lower()],
               doc['_id'])
def sumreducer(keys, values, rereduce):
    """Count rows on the first pass, then add up the partial counts.

    Args:
        keys: Unused; present because the reducer is called with three
            arguments.
        values: The mapped values when ``rereduce`` is false, or the
            results of earlier calls to this reducer when it is true.
        rereduce: True when ``values`` already holds partial counts.

    Returns:
        ``len(values)`` on the first pass, ``sum(values)`` on rereduce.
    """
    if rereduce:
        # Partial counts from earlier reduce calls: combine them.
        return sum(values)
    # First pass: each value is one mapped row, so count them.
    return len(values)
# Define the 'daily_tagcount' view in the 'index' design document, pairing the
# date+hashtag mapper with the counting reducer, and push it to the database.
# NOTE(review): `db` is not defined in this snippet; it is expected to be an
# already-opened couchdb database object.
view = ViewDefinition(
    'index',
    'daily_tagcount',
    date_hashtag_mapper,
    reduce_fun=sumreducer,
    language='python',
)
view.sync(db)
Finally, query the first view to find tags for the day and then query the second view for tweet counts per tag for the day.
import sys
import couchdb
import time
from datetime import date, datetime
# Usage: <script> DBNAME YYYY-MM-DD
# Prints the number of distinct hashtags seen on the given day, followed by
# the tweet count for each hashtag.
if len(sys.argv) < 3:
    # Fail with a usage hint instead of a bare IndexError on sys.argv.
    sys.exit("usage: %s DBNAME YYYY-MM-DD" % sys.argv[0])

server = couchdb.Server('http://localhost:5984')
dbname = sys.argv[1]
db = server[dbname]
_date = sys.argv[2]
dt = datetime.strptime(_date, "%Y-%m-%d").utctimetuple()

# get tags for this time interval
_key = [dt.tm_year, dt.tm_mon, dt.tm_mday]
tags = [row.value for row in db.view('index/time_hashtags', key=_key)]
# The mapper emits one row per tweet/hashtag pair, so de-duplicate.
tags = list(set(tags))
# Single-argument print(...) gives the same output on Python 2 and 3.
print("Tags today %d" % len(tags))
print("")

# get count for date and hashtag
for tag in sorted(tags):
    _key = [dt.tm_year, dt.tm_mon, dt.tm_mday, tag]
    tag_count = [row.value
                 for row in db.view('index/daily_tagcount', key=_key)]
    # The reduced view is expected to return one row holding the count;
    # treat "no rows" as zero rather than crashing on tag_count[0].
    count = tag_count[0] if tag_count else 0
    print("Found %d %s on %s-%s-%s " %
          (count, tag, _key[0], _key[1], _key[2]))
This code will evolve over time.
Find the complete codebase on GitHub at: https://github.com/telvis07/twitter_mining. The develop branch has the latest stuff.