Category Archives: nosql

twitter mining: count hashtags per day

We can use CouchDB views to count Twitter hashtags per day. I've used two views. The first view uses a mapper to map each hashtag to a [YEAR, MONTH, DAY] tuple. The view can subsequently be queried for the hashtags used on a given date.

import couchdb
from couchdb.design import ViewDefinition

def time_hashtag_mapper(doc):
    """Map every hashtag in a tweet to the UTC date the tweet was created.

    Yields one ``([year, month, day], hashtag)`` pair per entry in
    ``doc['entities']['hashtags']``, with the hashtag text lower-cased.
    Documents without an ``entities``/``hashtags`` entry yield nothing.

    ``doc['created_at']`` is handed to ``time.gmtime``, so it is assumed to
    be a number of seconds since the epoch -- TODO confirm against the code
    that stores the tweets. A missing or falsy value is treated as 0
    (Jan 1 1970).
    """
    # Imported inside the function, as in the original; presumably because
    # only the function's own source is shipped to the view server.
    from time import gmtime

    entities = doc.get('entities')
    if not (entities and entities.get('hashtags')):
        return

    # gmtime() always converts in UTC. The previous
    # datetime.fromtimestamp(...).utctimetuple() produced the *local* date
    # of the machine running the view, so keys shifted with its timezone.
    stamp = gmtime(doc.get('created_at') or 0)
    for hashtag in entities['hashtags']:
        yield ([stamp.tm_year, stamp.tm_mon, stamp.tm_mday],
               hashtag['text'].lower())

# Define the 'time_hashtags' view in the 'index' design document (it is
# queried later as 'index/time_hashtags'), using time_hashtag_mapper as the
# map function, declared as Python via language='python'.
view = ViewDefinition('index',
                      'time_hashtags',
                      time_hashtag_mapper,
                      language='python')
# NOTE(review): `db` is not defined in this snippet -- presumably an open
# couchdb database handle created elsewhere; confirm before running.
view.sync(db)

The second view maps each tweet to a [YEAR, MONTH, DAY, HASHTAG] tuple. A reducer is then used to count the tweets matching each tuple.

import couchdb
from couchdb.design import ViewDefinition

def date_hashtag_mapper(doc):
    """Map a tweet to one ``[year, month, day, hashtag]`` key per hashtag.

    Yields ``([year, month, day, hashtag], doc['_id'])`` for every entry in
    ``doc['entities']['hashtags']``, with the hashtag text lower-cased and
    the date taken in UTC. Documents without an ``entities``/``hashtags``
    entry yield nothing.

    ``doc['created_at']`` is handed to ``time.gmtime``, so it is assumed to
    be a number of seconds since the epoch -- TODO confirm against the code
    that stores the tweets. A missing or falsy value is treated as 0
    (Jan 1 1970).
    """
    # Imported inside the function, as in the original; presumably because
    # only the function's own source is shipped to the view server.
    from time import gmtime

    entities = doc.get('entities')
    if not (entities and entities.get('hashtags')):
        # Nothing to emit, so skip the timestamp conversion entirely.
        return

    # gmtime() always converts in UTC. The previous
    # datetime.fromtimestamp(...).utctimetuple() produced the *local* date
    # of the machine running the view, so keys shifted with its timezone.
    stamp = gmtime(doc.get('created_at') or 0)
    for hashtag in entities['hashtags']:
        yield ([stamp.tm_year, stamp.tm_mon, stamp.tm_mday,
                hashtag['text'].lower()],
               doc['_id'])

def sumreducer(keys, values, rereduce):
    """Reduce step that counts rows, then adds up the partial counts.

    When ``rereduce`` is falsy the result is the number of entries in
    ``values``; when it is truthy, ``values`` is summed instead. ``keys``
    is accepted but not used.
    """
    return sum(values) if rereduce else len(values)

# Define the 'daily_tagcount' view in the 'index' design document (it is
# queried later as 'index/daily_tagcount'): date_hashtag_mapper is the map
# function and sumreducer the reduce function, both declared as Python via
# language='python'.
view = ViewDefinition('index',
                      'daily_tagcount',
                      date_hashtag_mapper,
                      reduce_fun=sumreducer,
                      language='python')
# NOTE(review): `db` is not defined in this snippet -- presumably an open
# couchdb database handle created elsewhere; confirm before running.
view.sync(db)

Finally, query the first view to find tags for the day and then query the second view for tweet counts per tag for the day.

import sys
import couchdb
import time
from datetime import date, datetime

# Report how many tweets used each hashtag on a given day.
#
# Command line: <script> DBNAME YYYY-MM-DD
#   DBNAME      name of the database on the local CouchDB server
#   YYYY-MM-DD  the day to report on
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: %s DBNAME YYYY-MM-DD" % sys.argv[0])

server = couchdb.Server('http://localhost:5984')
dbname = sys.argv[1]
db = server[dbname]

_date = sys.argv[2]
dt = datetime.strptime(_date, "%Y-%m-%d").utctimetuple()

# get tags for this time interval; the view emits one row per
# (tweet, hashtag) pair, so collapse duplicates before counting
_key = [dt.tm_year, dt.tm_mon, dt.tm_mday]
tags = sorted(set(row.value
                  for row in db.view('index/time_hashtags', key=_key)))
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("Tags today %d" % len(tags))
print("")

# get count for date and hashtag
for tag in tags:
    _key = [dt.tm_year, dt.tm_mon, dt.tm_mday, tag]
    rows = list(db.view('index/daily_tagcount', key=_key))
    # The view has a reduce function, so the first row's value is the count
    # for this key; treat "no row" as zero rather than raising IndexError.
    tag_count = rows[0].value if rows else 0
    print("Found %d %s on %s-%s-%s " %
          (tag_count, tag, _key[0], _key[1], _key[2]))

This code will evolve over time.
Find the complete codebase on github at: https://github.com/telvis07/twitter_mining. The develop branch has the latest stuff.