Kaggle Credit Risk Competition

Kaggle Competition Goal

Detect which loans are at risk of default using credit application data and third-party credit data.

My Approach

Fetch the competition data from the Home Credit Default Risk competition, generate numeric and categorical features, then build models using TensorFlow, scikit-learn, and XGBoost.
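
As a rough sketch of the modeling step, here is a minimal XGBoost baseline. The file and column names (application_train.csv, TARGET, SK_ID_CURR) come from the competition data, but the feature handling and parameters are illustrative, not the repo's actual pipeline:

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# application_train.csv, TARGET and SK_ID_CURR are from the competition data;
# the feature handling and hyperparameters below are illustrative only.
df = pd.read_csv('application_train.csv')
y = df['TARGET']
X = pd.get_dummies(df.drop(columns=['TARGET', 'SK_ID_CURR']))

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

model = xgb.XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
model.fit(X_train, y_train)
print('validation AUC:', roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1]))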

GitHub

See my kaggle_credit_risk GitHub repo for the source used to generate features, train models, and run model experiments.

twitter mining: count hashtags per day

We can use CouchDB views to count Twitter hashtags per day. I've used two views. The first uses a mapper to map each hashtag to a [YEAR, MONTH, DAY] key. The view can subsequently be queried for the hashtags seen on that date.

import sys
import couchdb
from couchdb.design import ViewDefinition

server = couchdb.Server('http://localhost:5984')
db = server[sys.argv[1]]

def time_hashtag_mapper(doc):
    """Map each hashtag to a [year, month, day] key"""
    from datetime import datetime
    if doc.get('created_at'):
        _date = doc['created_at']
    else:
        _date = 0  # Jan 1 1970

    if doc.get('entities') and doc['entities'].get('hashtags'):
        dt = datetime.fromtimestamp(_date).utctimetuple()
        for hashtag in doc['entities']['hashtags']:
            yield ([dt.tm_year, dt.tm_mon, dt.tm_mday],
                   hashtag['text'].lower())

view = ViewDefinition('index',
                      'time_hashtags',
                      time_hashtag_mapper,
                      language='python')
view.sync(db)
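
For example, the hashtags used on a given day can be listed by querying the view with a [year, month, day] key. Each matching tweet yields one row, so the full script later in this post deduplicates the values with set():

for row in db.view('index/time_hashtags', key=[2012, 3, 5]):
    print row.value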

The second view maps each tweet to a [YEAR, MONTH, DAY, HASHTAG] key. A reducer then counts the tweets matching each key.

import sys
import couchdb
from couchdb.design import ViewDefinition

server = couchdb.Server('http://localhost:5984')
db = server[sys.argv[1]]

def date_hashtag_mapper(doc):
    """Map each tweet to a [year, month, day, hashtag] key"""
    from datetime import datetime
    if doc.get('created_at'):
        _date = doc['created_at']
    else:
        _date = 0  # Jan 1 1970

    dt = datetime.fromtimestamp(_date).utctimetuple()
    if doc.get('entities') and doc['entities'].get('hashtags'):
        for hashtag in doc['entities']['hashtags']:
            yield ([dt.tm_year, dt.tm_mon, dt.tm_mday,
                    hashtag['text'].lower()],
                   doc['_id'])

def sumreducer(keys, values, rereduce):
    """Count rows on the first pass, then sum the partial counts"""
    if rereduce:
        return sum(values)
    else:
        return len(values)

view = ViewDefinition('index',
                      'daily_tagcount',
                      date_hashtag_mapper,
                      reduce_fun=sumreducer,
                      language='python')
view.sync(db)

Finally, query the first view to find the tags for a given day, then query the second view for the tweet count per tag.

import sys
import couchdb
from datetime import datetime

server = couchdb.Server('http://localhost:5984')
dbname = sys.argv[1]
db = server[dbname]

_date = sys.argv[2]
dt = datetime.strptime(_date, "%Y-%m-%d").utctimetuple()

# get the tags for this date
_key = [dt.tm_year, dt.tm_mon, dt.tm_mday]
tags = [row.value for row in db.view('index/time_hashtags', key=_key)]
tags = list(set(tags))
print "Tags today", len(tags)
print ""

# get the reduced count for each date+hashtag key
for tag in sorted(tags):
    _key = [dt.tm_year, dt.tm_mon, dt.tm_mday, tag]
    tag_count = \
      [row.value for row in db.view('index/daily_tagcount', key=_key)]
    print "Found %d %s on %s-%s-%s" % \
      (tag_count[0], tag, _key[0], _key[1], _key[2])

This code will evolve over time.
Find the complete codebase on GitHub at https://github.com/telvis07/twitter_mining. The develop branch has the latest code.

twitter mining by geolocation

Twitter's streaming API permits filtering tweets by geolocation. According to the API documentation, only tweets created using the Geotagging API can be filtered. The code below uses tweepy to filter tweets from the San Francisco area.

#!/usr/bin/env python
import tweepy
import ConfigParser
import os, sys

class Listener(tweepy.StreamListener):
    def on_status(self, status):
        print "screen_name='%s' tweet='%s'"%(status.author.screen_name, status.text)

def login(config):
    """Tweepy oauth dance
    The config file should contain:

    [auth]
    CONSUMER_KEY = ...
    CONSUMER_SECRET = ...
    ACCESS_TOKEN = ...
    ACCESS_TOKEN_SECRET = ...
    """     
    CONSUMER_KEY = config.get('auth','CONSUMER_KEY')
    CONSUMER_SECRET = config.get('auth','CONSUMER_SECRET')
    ACCESS_TOKEN = config.get('auth','ACCESS_TOKEN')
    ACCESS_TOKEN_SECRET = config.get('auth','ACCESS_TOKEN_SECRET')
    
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    return auth


fn = sys.argv[1]
config = ConfigParser.RawConfigParser()
config.read(fn)
try:
    auth = login(config)
    streaming_api = tweepy.streaming.Stream(auth, Listener(), timeout=60)
    # San Francisco area. locations is a bounding box:
    # SW longitude, SW latitude, NE longitude, NE latitude
    streaming_api.filter(follow=None, locations=[-122.75, 36.8, -121.75, 37.8])
except KeyboardInterrupt:
    print "got keyboardinterrupt"

Find the complete codebase on GitHub at https://github.com/telvis07/twitter_mining.

twitter mining: top tweets with links

It's useful to filter out "conversational" tweets and look for tweets that link to another page, a picture, etc.

We create a view that maps only tweets with link entities.

import sys
import couchdb
from couchdb.design import ViewDefinition

server = couchdb.Server('http://localhost:5984')
db = server[sys.argv[1]]

def url_tweets_by_created_at(doc):
    """Map tweets that contain url entities to their timestamp"""
    if doc.get('created_at'):
        _date = doc['created_at']
    else:
        _date = 0  # Jan 1 1970

    if (doc.get('entities') and doc['entities'].get('urls')
            and len(doc['entities']['urls'])):
        if doc.get('user'):
            yield (_date, doc)

view = ViewDefinition('index', 'daily_url_tweets',
                      url_tweets_by_created_at, language='python')
view.sync(db)

Next we create a script that queries this view and returns the top tweeters and their tweets.

import sys
import time
import couchdb
from datetime import datetime

def run(db, date, limit=10):
    """Query a couchdb view for tweets. Sort in memory by follower count.
    Return the top tweeters and their tweets"""
    print "Finding top %d tweeters" % limit

    dt = datetime.strptime(date, "%Y-%m-%d")
    stime = int(time.mktime(dt.timetuple()))
    etime = stime + 86400 - 1  # end of the same day
    tweeters = {}
    tweets = {}
    # get screen_name, follower counts and tweet ids for looking up later
    for row in db.view('index/daily_url_tweets', startkey=stime, endkey=etime):
        status = row.value
        screen_name = status['user']['screen_name']
        followers_count = status['user']['followers_count']
        tweeters[screen_name] = int(followers_count)
        if screen_name not in tweets:
            tweets[screen_name] = []
        tweets[screen_name].append(status['id_str'])

    # sort by follower count, descending
    print len(tweeters)
    di = tweeters.items()
    di.sort(key=lambda x: x[1], reverse=True)
    out = {}
    for i in range(min(limit, len(di))):
        screen_name = di[i][0]
        followers_count = di[i][1]
        out[screen_name] = {}
        out[screen_name]['follower_count'] = followers_count
        out[screen_name]['tweets'] = {}
        for tweetid in tweets[screen_name]:
            status = db[tweetid]
            text = status['orig_text']
            # replace each shortened url with its expanded form
            urls = status['entities']['urls']
            for url in urls:
                text = text.replace(url['url'], url['expanded_url'])
            out[screen_name]['tweets'][tweetid] = text

    return out

server = couchdb.Server('http://localhost:5984')
dbname = sys.argv[1]
db = server[dbname]
date = '2012-03-05'
output = run(db, date)
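
The returned dict maps each screen name to its follower count and expanded tweets. A minimal way to print it (the formatting here is illustrative):

for name, info in sorted(output.items(), key=lambda kv: kv[1]['follower_count'], reverse=True):
    print name, info['follower_count']
    for tweetid, text in info['tweets'].items():
        print "  %s" % text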

Find the complete codebase on GitHub at https://github.com/telvis07/twitter_mining.

twitter mining: top tweets by follower count

We can find interesting tweets using the author's follower count and the tweet timestamp. We store tweets in CouchDB and collect them with tweepy streaming. With these tools we can find the top N tweets per day. The code below uses the couchpy view server to write views in Python. Setting up couchpy is simple: install it, then register it as a query server in /etc/couchdb/local.ini.

Install couchpy and couchdb-python with the following command.

pip install couchdb

Verify that couchpy is installed.

$ which couchpy
/usr/bin/couchpy

Edit /etc/couchdb/local.ini

[query_servers]
python=/usr/bin/couchpy
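
CouchDB reads the query server configuration at startup, so restart it after editing local.ini. The exact command depends on your platform; on Debian/Ubuntu it is something like:

sudo service couchdb restart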

This is a simple view mapper that maps each tweet to its timestamp so we can query by start and end time.


import sys
import couchdb
from couchdb.design import ViewDefinition

server = couchdb.Server('http://localhost:5984')
db = server[sys.argv[1]]

def tweets_by_created_at(doc):
    """Map each tweet to its timestamp"""
    if doc.get('created_at'):
        _date = doc['created_at']
    else:
        _date = 0  # Jan 1 1970

    if doc.get('user'):
        yield (_date, doc)

view = ViewDefinition('index', 'daily_tweets', tweets_by_created_at,
                      language='python')
view.sync(db)

The code below queries the view for all tweets within a date range, then sorts them in memory by follower count.

import sys
import time
import couchdb
from datetime import datetime

def run(db, date, limit=10):
    """Query a couchdb view for tweets. Sort in memory by follower count.
    Return the top tweeters and their tweets"""
    print "Finding top %d tweeters" % limit

    dt = datetime.strptime(date, "%Y-%m-%d")
    stime = int(time.mktime(dt.timetuple()))
    etime = stime + 86400 - 1  # end of the same day
    tweeters = {}
    tweets = {}
    for row in db.view('index/daily_tweets', startkey=stime, endkey=etime):
        status = row.value
        screen_name = status['user']['screen_name']
        followers_count = status['user']['followers_count']
        tweeters[screen_name] = int(followers_count)
        if screen_name not in tweets:
            tweets[screen_name] = []
        tweets[screen_name].append(status['id_str'])

    # sort by follower count, descending
    di = tweeters.items()
    di.sort(key=lambda x: x[1], reverse=True)
    out = {}
    for i in range(min(limit, len(di))):
        screen_name = di[i][0]
        followers_count = di[i][1]
        out[screen_name] = {}
        out[screen_name]['follower_count'] = followers_count
        out[screen_name]['tweets'] = {}
        for tweetid in tweets[screen_name]:
            orig_text = db[tweetid]['orig_text']
            out[screen_name]['tweets'][tweetid] = orig_text

    return out

server = couchdb.Server('http://localhost:5984')
dbname = sys.argv[1]
db = server[dbname]
date = '2012-03-05'
output = run(db, date)

Find the complete codebase on GitHub at https://github.com/telvis07/twitter_mining.