It's useful to filter out "conversational" tweets and look for tweets with links to another page or picture, etc.
We create a view that maps only tweets that have link (URL) entities.
import couchdb
from couchdb.design import ViewDefinition
import sys
def url_tweets_by_created_at(doc):
    """Map function: emit tweets that carry at least one URL entity.

    Args:
        doc: A tweet document (dict-like).

    Yields:
        ``(created_at, doc)`` for every document that has a non-empty
        ``entities['urls']`` collection and a truthy ``user`` field.
        ``created_at`` falls back to 0 when it is missing or falsy.
    """
    # Documents without a creation time sort first (0 == Jan 1 1970).
    _date = doc.get('created_at') or 0

    # "Conversational" tweets have no URL entities and are skipped.
    entities = doc.get('entities')
    if entities and entities.get('urls') and doc.get('user'):
        yield (_date, doc)
# Define the view under the name 'index/daily_url_tweets' (the name the
# reader app queries), using url_tweets_by_created_at as its map function.
view = ViewDefinition('index', 'daily_url_tweets',
url_tweets_by_created_at, language='python')
# Store/update the view definition in the database.
# NOTE(review): `db` is not defined in this snippet; presumably a
# couchdb Database handle opened beforehand -- confirm against the full script.
view.sync(db)
Next we create an app that reads from this view and displays the results.
import time
from datetime import datetime

import couchdb
def run(db, date, limit=10):
    """Return the most-followed users who tweeted links on a given day.

    Queries the ``index/daily_url_tweets`` view for rows keyed within the
    given day, ranks the tweeting users by follower count in memory, and
    collects the tweets of the top ``limit`` users with every shortened URL
    replaced by its expanded form.

    Args:
        db: Database handle supporting ``db.view(name, startkey=, endkey=)``
            and ``db[doc_id]`` lookups.
        date: Day to query, formatted as ``YYYY-MM-DD``.
        limit: Maximum number of users to return. Fewer are returned when
            fewer users tweeted that day.

    Returns:
        A dict mapping screen name to
        ``{'follower_count': int, 'tweets': {tweet_id: text}}``.

    Raises:
        ValueError: If ``date`` does not match ``%Y-%m-%d``.
    """
    print("Finding top %d tweeters" % limit)

    # time.mktime interprets the date in the machine's local time zone.
    # NOTE(review): assumes the view keys (`created_at`) are epoch seconds
    # comparable with these bounds -- confirm against how tweets are stored.
    day_start = datetime.strptime(date, "%Y-%m-%d")
    stime = int(time.mktime(day_start.timetuple()))
    etime = stime + 86400 - 1  # last second of the same day

    tweeters = {}  # screen_name -> follower count (last seen value wins)
    tweets = {}    # screen_name -> list of tweet ids to look up later
    for row in db.view('index/daily_url_tweets', startkey=stime, endkey=etime):
        status = row.value
        user = status['user']
        screen_name = user['screen_name']
        tweeters[screen_name] = int(user['followers_count'])
        tweets.setdefault(screen_name, []).append(status['id_str'])

    print(len(tweeters))

    # Rank users by follower count, highest first.
    ranked = sorted(tweeters.items(), key=lambda item: item[1], reverse=True)

    out = {}
    # Slicing (rather than indexing 0..limit-1) tolerates days with fewer
    # than `limit` distinct users; max() keeps a negative limit yielding {}.
    for screen_name, followers_count in ranked[:max(limit, 0)]:
        user_tweets = {}
        for tweetid in tweets[screen_name]:
            status = db[tweetid]
            text = status['orig_text']
            for url in status['entities']['urls']:
                expanded = url.get('expanded_url')
                # str.replace raises TypeError on None, so URL entities
                # without an expanded form are left untouched.
                if expanded:
                    text = text.replace(url['url'], expanded)
            user_tweets[tweetid] = text
        out[screen_name] = {
            'follower_count': followers_count,
            'tweets': user_tweets,
        }
    return out
# Connect to the CouchDB server at localhost:5984.
server = couchdb.Server('http://localhost:5984')
# NOTE(review): `dbname` is not defined in the visible source; it must be
# set to the name of the tweet database before this line runs.
db = server[dbname]
# Day to report on, in the YYYY-MM-DD format that run() parses.
date = '2012-03-05'
# Maps screen name -> {'follower_count': ..., 'tweets': {tweet_id: text}}.
output = run(db, date)
Find the complete codebase on GitHub at: https://github.com/telvis07/twitter_mining