Gini index

Analysis of the Gini index. The decision threshold ranges from 0.95 to 1.0.

%matplotlib inline

import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

Load the dataset; it is split across multiple files in the `gini_results` directory (one file per threshold).

import codecs
import json
import collections
from os import listdir
from os.path import isfile, join
# Collect the names of every plain file inside the results directory
# (subdirectories, if any, are skipped).
files = [
    name
    for name in listdir("gini_results")
    if isfile(join("gini_results", name))
]

# Parse each result file (one JSON object per line) into a DataFrame,
# keyed by the decision threshold encoded in the file name
# (expected pattern: <prefix>_<threshold>_...).
values = {}
for f in files:
    threshold = float(f.split("_")[1])
    rows = []
    # Use a context manager so each file handle is closed promptly, even if
    # a line fails to parse — the original left every handle open.
    with codecs.open("gini_results/" + f, "r", "utf-8") as dataset:
        for line in dataset:
            rows.append(json.loads(line))
    values[threshold] = pd.DataFrame.from_dict(rows)
values[0.95].head()
article_id category found isa
0 36634326 People_by_occupation_and_nationality True True
1 43128890 History_by_topic_and_country False False
2 6669247 Broadcasting_stations_and_networks True False
3 14517666 Music_by_geographical_categorization True True
4 28964101 Conceptual_systems True False

Schema of each per-threshold DataFrame:
'found': the predicted label (model output)
'isa': the ground-truth label of the sample

len(values)
250

Compute the scores

# Build one precision/recall/ROC row per threshold.
# Each frame v has 'found' (predicted label) and 'isa' (true label).
scores_list = []
for t, v in values.items():
    true_positive = v[(v.found == True) & (v.isa == True)]
    true_negative = v[(v.found == False) & (v.isa == False)]
    false_positive = v[(v.found == True) & (v.isa == False)]
    false_negative = v[(v.found == False) & (v.isa == True)]
    predicted_positive = v[v.found == True]  # TP + FP
    real_positive = v[v.isa == True]         # TP + FN
    # Precision & recall (assumes every threshold file has at least one
    # predicted positive and one real positive; otherwise this divides by 0)
    precision = len(true_positive) / len(predicted_positive)
    recall = len(true_positive) / len(real_positive)
    # True/false positive rate.
    # BUG FIX: the original denominator was len(real_positive) + len(false_negative)
    # = TP + 2*FN. TPR is defined as TP / (TP + FN) and must equal recall.
    true_positive_rate = len(true_positive) / (len(true_positive) + len(false_negative))
    false_positive_rate = len(false_positive) / (len(false_positive) + len(true_negative))
    # One result row per threshold
    scores_list.append({"threshold": t, "precision": precision, "recall": recall,
                        "tpr": true_positive_rate, "fpr": false_positive_rate})

scores = pd.DataFrame.from_dict(scores_list).sort_values(by="threshold").reset_index(drop=True)
scores.head()
fpr precision recall threshold tpr
0 0.735354 0.362522 0.96729 0.9500 0.936652
1 0.735354 0.362522 0.96729 0.9502 0.936652
2 0.735354 0.362522 0.96729 0.9504 0.936652
3 0.731313 0.363796 0.96729 0.9506 0.936652
4 0.741414 0.360627 0.96729 0.9508 0.936652

Precision and recall as functions of the threshold

# Plot precision and recall against the threshold: first together on one
# axis, then each metric on its own axis.
ax = scores.plot(x="threshold", y=["precision", "recall"])
ax.grid(True)
ax = scores.plot(x="threshold", y="precision")
ax.grid(True)
ax = scores.plot(x="threshold", y="recall")
ax.grid(True)

Precision vs Recall

# Precision-recall curve: sort by precision so the line is drawn left-to-right.
pr_ax = scores.sort_values(by="precision").plot(x="precision", y="recall")
pr_ax.grid(True)

ROC curve (true-positive rate vs. false-positive rate)

# ROC curve: sort by the false-positive rate so the x axis is monotone.
roc_ax = scores.sort_values(by="fpr").plot(x="fpr", y="tpr")
roc_ax.grid(True)