It's useful to filter out "conversational" tweets and look for tweets with links to another page or picture, etc.
We create a view that maps only tweets with link entities.
import couchdb
from couchdb.design import ViewDefinition
import sys


def url_tweets_by_created_at(doc):
    """Map function: emit ``(created_at, doc)`` for tweets with URL entities.

    Nothing is emitted unless the document has a non-empty
    ``entities['urls']`` value and a truthy ``user`` field. When
    ``created_at`` is missing or falsy the key falls back to 0
    (Jan 1 1970).
    """
    entities = doc.get('entities')
    if not entities:
        return
    links = entities.get('urls')
    if not links or not len(links):
        return
    if not doc.get('user'):
        return
    # Key by creation time so the view can be range-queried by date.
    yield (doc.get('created_at') or 0, doc)


# Register the map function as the 'daily_url_tweets' view of the 'index'
# design document.
view = ViewDefinition(
    'index',
    'daily_url_tweets',
    url_tweets_by_created_at,
    language='python',
)
# NOTE(review): `db` is not defined in this snippet; it is presumably a
# couchdb database handle created elsewhere - confirm before running.
view.sync(db)
Next we create an app that reads from this view and displays the results.
import couchdb
import time
from datetime import datetime


def run(db, date, limit=10):
    """Return the most-followed users who tweeted links on a given day.

    Queries the ``index/daily_url_tweets`` view for rows whose key lies in
    the 86400 seconds starting at ``date``, ranks the tweeters in memory by
    follower count, and collects the tweets of the top ``limit`` users with
    each shortened URL replaced by its expanded form.

    Args:
        db: Database handle supporting ``view(name, startkey=, endkey=)``
            and lookup of a document by id (``db[doc_id]``).
        date: Day to query, formatted ``YYYY-MM-DD``.
        limit: Maximum number of tweeters to return.

    Returns:
        A dict mapping screen name to
        ``{'follower_count': int, 'tweets': {tweet_id: text}}``. It holds
        fewer than ``limit`` entries when fewer users matched.

    Raises:
        ValueError: If ``date`` does not match ``%Y-%m-%d``.
    """
    print("Finding top %d tweeters" % limit)
    day = datetime.strptime(date, "%Y-%m-%d")
    # NOTE(review): mktime interprets the date in the local timezone;
    # confirm that matches how the view's created_at keys were produced.
    stime = int(time.mktime(day.timetuple()))
    etime = stime + 86400 - 1

    tweeters = {}
    tweets = {}
    # Get screen_name, follower counts and tweet ids for looking up later.
    for row in db.view('index/daily_url_tweets', startkey=stime, endkey=etime):
        status = row.value
        user = status['user']
        screen_name = user['screen_name']
        # The last row seen for a user determines the recorded count.
        tweeters[screen_name] = int(user['followers_count'])
        tweets.setdefault(screen_name, []).append(status['id_str'])

    print(len(tweeters))
    ranked = sorted(tweeters.items(), key=lambda item: item[1], reverse=True)

    out = {}
    # Slicing (rather than indexing 0..limit-1) tolerates days with fewer
    # than `limit` tweeters; max() keeps a negative limit yielding nothing.
    for screen_name, followers_count in ranked[:max(limit, 0)]:
        user_tweets = {}
        for tweetid in tweets[screen_name]:
            status = db[tweetid]
            text = status['orig_text']
            for url in status['entities']['urls']:
                short_url = url.get('url')
                expanded_url = url.get('expanded_url')
                # str.replace needs two strings; skip entities that lack
                # either form instead of raising.
                if short_url and expanded_url:
                    text = text.replace(short_url, expanded_url)
            user_tweets[tweetid] = text
        out[screen_name] = {
            'follower_count': followers_count,
            'tweets': user_tweets,
        }
    return out


server = couchdb.Server('http://localhost:5984')
# NOTE(review): `dbname` is not defined in this snippet; set it to the name
# of the CouchDB database holding the tweets before running.
db = server[dbname]
date = '2012-03-05'
output = run(db, date)
The complete codebase is available on GitHub at: https://github.com/telvis07/twitter_mining