Investigate HashingVectorizer

This notebook was created as part of the following tasks:

About HashingVectorizer

See http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), possibly normalized as token frequencies if norm='l1' or projected onto the Euclidean unit sphere if norm='l2'.
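As a quick sketch of what that means in practice (toy documents and an arbitrary n_features chosen purely for illustration, not values used later in this notebook):

from sklearn.feature_extraction.text import HashingVectorizer

# Illustrative only: hash three toy documents into a fixed-width sparse matrix.
docs = ['some wiki text', 'more wiki text', 'blanked the page!!!']
hv = HashingVectorizer(n_features=2**16, norm='l2')
X = hv.transform(docs)
print(X.shape)  # (3, 65536) -- a scipy.sparse matrix
print(X.nnz)    # number of non-zero hashed token entries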

Conventions and terminologies

hv_features = hashed features
other_features = existing 77 features
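A hedged sketch of how the two groups might be combined downstream (the placeholder data below is hypothetical; this is an assumption about the intended pipeline, not code from this notebook):

import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import HashingVectorizer

# Hypothetical placeholder data: 3 documents and a (3, 77) block standing in
# for the existing 77 features per observation.
hv_features = HashingVectorizer(n_features=2**16).transform(
    ['doc one', 'doc two', 'doc three'])
other_features = np.zeros((3, 77))
combined = hstack([hv_features, csr_matrix(other_features)])
print(combined.shape)  # (3, 65536 + 77)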

import csv
import pickle
import pprint
import random
import mwapi
import time
import sys
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.externals import joblib
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.feature_selection import SelectFromModel
from sklearn.grid_search import GridSearchCV
from scipy.sparse import coo_matrix, vstack, hstack
import numpy as np
import sqlite3
import hashlib
pp = pprint.PrettyPrinter(indent=4)
def open_db(db_name):
    # Open a SQLite connection in autocommit mode (no implicit transactions).
    conn = sqlite3.connect(db_name)
    conn.isolation_level = None
    return conn

def create_sqlite_tables():
    # data.db - tsv is exported here
    conn = open_db('data.db')
    c = conn.cursor()

    # create observations table
    c.execute('''CREATE TABLE IF NOT EXISTS observations
    (revid INTEGER PRIMARY KEY, other_features TEXT, is_damaging INTEGER)''')

    # create content table
    c.execute('''CREATE TABLE IF NOT EXISTS content
    (revid INTEGER PRIMARY KEY, revid_parent INTEGER, content_current BLOB, content_parent BLOB)''')

    conn.close()

    # features db
    conn = open_db('features.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS feature_vector
    (revid INTEGER PRIMARY KEY, current BLOB, parent BLOB, diff BLOB, other_features BLOB, is_damaging INTEGER)''')
    conn.close()

    # score db
    # TODO - verify that we populate other_features when features.db
    # is newly created
    conn = open_db('score.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS score
    (revid INTEGER PRIMARY KEY, is_damaging_actual INTEGER, is_damaging_prediction INTEGER, score_positive REAL)''')
    conn.close()

def fix_data_type(i):
    # Convert a TSV string field to a bool, int, or float.
    if i == 'True':
        return True
    if i == 'False':
        return False
    if i.isdigit():
        return int(i)
    return float(i)
    
def read_tsv(fileobj):
    # Yield rows from a tab-separated file object.
    tsvin = csv.reader(fileobj, delimiter='\t')
    for row in tsvin:
        yield row
        
def export_tsv_to_sqlite():
    create_sqlite_tables()
    conn = open_db('data.db')
    c = conn.cursor()

    filename = 'enwiki.features_damaging.20k_2015.tsv'
    f = open(filename, 'rt')
    i = 1
    for row in read_tsv(f):
        print(i)
        i = i + 1
        print(row)
        other_features = list(map(fix_data_type, row))
        print(other_features)
        # Debugging: inspect only the first row; the INSERT below is
        # unreachable while this break is in place.
        break
        # sqlite3 cannot bind a Python list directly, so the stored value
        # would need to be serialized first, e.g.:
        # other_features = pickle.dumps(map(fix_data_type, row[1:-1]))
        c.execute('''INSERT INTO observations
        (revid, other_features, is_damaging)
        VALUES (?, ?, ?)''', (row[0], other_features, row[-1]))
    conn.commit()
    conn.close()
create_sqlite_tables()
export_tsv_to_sqlite()
1
['644933637', 'False', 'False', 'False', '11.487669193554064', '9.701493759944652', '8.358900612421644', '2.8903717578961645', '4.804021044733257', '5.924255797414532', '6.354370040797351', '0.6931471805599453', '0.0', '22.85094914459808', '0.26112233033474086', '0.00398406374501992', '0.18138424821002386', '62.0', '62.0', '0.0', '0.16181375410146775', '0.16181375410146775', '0.0', '2.0', '2.0', '0.0', '0.5357142857142857', '0.5357142857142857', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1758', '319', '83', '83', '4', '10', '8', '0', '0', '1', '1', 'True', 'False', 'False', 'False', 'False', '19.345125981288703', 'False', 'False', '0', '0', '0', '0.0', '0.0', '0.0', '0', '0', '0', '0.0', '0.0', '0.0', '63', '63', '0', '2.596230726772481', '2.596230726772481', '0.0', '20', '20', '0', '11.046029402323938', '11.046029402323938', '0.0', 'False']
[644933637, False, False, False, 11.487669193554064, 9.701493759944652, 8.358900612421644, 2.8903717578961645, 4.804021044733257, 5.924255797414532, 6.354370040797351, 0.6931471805599453, 0.0, 22.85094914459808, 0.26112233033474086, 0.00398406374501992, 0.18138424821002386, 62.0, 62.0, 0.0, 0.16181375410146775, 0.16181375410146775, 0.0, 2.0, 2.0, 0.0, 0.5357142857142857, 0.5357142857142857, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1758, 319, 83, 83, 4, 10, 8, 0, 0, 1, 1, True, False, False, False, False, 19.345125981288703, False, False, 0, 0, 0, 0.0, 0.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 63, 63, 0, 2.596230726772481, 2.596230726772481, 0.0, 20, 20, 0, 11.046029402323938, 11.046029402323938, 0.0, False]
filename = 'enwiki.features_damaging.20k_2015.tsv'

data =  np.loadtxt(fname = filename, delimiter = '\t')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-23-ff431b0f2344> in <module>()
      1 filename = 'enwiki.features_damaging.20k_2015.tsv'
      2 
----> 3 data =  np.loadtxt(fname = filename, delimiter = '\t')

/srv/paws/lib/python3.4/site-packages/numpy/lib/npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
    928 
    929             # Convert each value according to its column and store
--> 930             items = [conv(val) for (conv, val) in zip(converters, vals)]
    931             # Then pack it according to the dtype's nesting
    932             items = pack_items(items, packing)

/srv/paws/lib/python3.4/site-packages/numpy/lib/npyio.py in <listcomp>(.0)
    928 
    929             # Convert each value according to its column and store
--> 930             items = [conv(val) for (conv, val) in zip(converters, vals)]
    931             # Then pack it according to the dtype's nesting
    932             items = pack_items(items, packing)

/srv/paws/lib/python3.4/site-packages/numpy/lib/npyio.py in floatconv(x)
    657         if b'0x' in x:
    658             return float.fromhex(asstr(x))
--> 659         return float(x)
    660 
    661     typ = dtype.type

ValueError: could not convert string to float: b'False'
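The failure is expected: np.loadtxt coerces every field to float, and this TSV mixes boolean strings ('True'/'False') with numeric columns. One possible workaround, sketched here under the assumption that the helpers defined above (read_tsv, fix_data_type) are reused rather than replaced:

# Sketch: parse the mixed-type TSV with csv + fix_data_type instead of np.loadtxt.
rows = []
with open(filename, 'rt') as f:
    for row in read_tsv(f):
        rows.append(list(map(fix_data_type, row)))
# Coercing to float maps True -> 1.0 and False -> 0.0, which is what
# np.loadtxt could not do on its own for these columns.
data = np.array(rows, dtype=float)
print(data.shape)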