# Read in dataset
import pandas as pd
# Load the tab-separated rename log, then stack the old and new names into
# one long table; is_old marks which side of the rename a row came from.
df = pd.read_csv('commons_file_name_change_log.tsv', sep='\t')
old_names = (
    df[['log_timestamp', 'old_name']]
    .rename(columns={'log_timestamp': 'timestamp', 'old_name': 'name'})
    .assign(is_old=1)
)
new_names = (
    df[['log_timestamp', 'new_name']]
    .rename(columns={'log_timestamp': 'timestamp', 'new_name': 'name'})
    .assign(is_old=0)
)
data = pd.concat((old_names, new_names), ignore_index=True)
# Remove filename extension
import os
# Split every name into (stem, extension) once, then store the extension in
# its own column and keep only the stem in 'name'.
name_parts = [os.path.splitext(full_name) for full_name in data['name']]
data['extension'] = [ext for _stem, ext in name_parts]
data['name'] = [stem for stem, _ext in name_parts]
# Construct features

# count whitespace gaps between words
def countSpaces(col):
    """Return the number of whitespace gaps between the tokens of ``col``.

    ``col`` is split on runs of whitespace, so consecutive whitespace
    characters count as one gap and leading/trailing whitespace is ignored.
    An empty or whitespace-only string has no gaps and yields 0 (the plain
    ``len(col.split()) - 1`` would give -1 for it).
    """
    return max(len(col.split()) - 1, 0)
# Feature: number of whitespace gaps in each (extension-less) name.
data['n_spaces'] = data['name'].map(countSpaces)

# proportion of alphabetic characters
def alphaRatio(name):
    """Return the fraction of non-space characters in ``name`` that are letters.

    Only the plain space character ' ' is removed before counting; letters
    are detected with ``str.isalpha``. Returns 0.0 when nothing is left after
    removing spaces (empty or all-space name) instead of raising
    ZeroDivisionError.
    """
    stripped = name.replace(' ', '')  # spaces are excluded from the denominator
    if not stripped:
        return 0.0
    return sum(c.isalpha() for c in stripped) / len(stripped)
# Feature: share of letters among the non-space characters of each name.
data['alpha_ratio'] = data['name'].map(alphaRatio)

# ratio of upper-case letters
def isUpper(name):
    """Return the fraction of characters in ``name`` that are upper-case.

    The denominator is the full length of ``name``, spaces included.
    Returns 0.0 for an empty name instead of raising ZeroDivisionError.
    """
    if not name:
        return 0.0
    return sum(c.isupper() for c in name) / len(name)
# Feature: share of upper-case characters in each name.
data['upper_ratio'] = data['name'].map(isUpper)

# Feature: number of characters in each (extension-less) name.
data['name_length'] = data['name'].map(len)

# whitespace-gap ratio
def ratioSpaces(col):
    """Return the number of whitespace gaps in ``col`` divided by its length.

    Gaps are counted between whitespace-separated tokens, so runs of
    whitespace count once and leading/trailing whitespace is ignored.
    Returns 0.0 for an empty string (instead of raising ZeroDivisionError)
    and for a whitespace-only string (instead of a negative ratio).
    """
    if not col:
        return 0.0
    gaps = max(len(col.split()) - 1, 0)  # no tokens means no gaps, never -1
    return gaps / len(col)
# Feature: whitespace gaps relative to name length.
data['space_ratio'] = data['name'].map(ratioSpaces)

# Preview the last ten rows of the finished feature table.
data.tail(n=10)
timestamp name is_old extension n_spaces alpha_ratio upper_ratio name_length space_ratio
545362 20171012202552 Sutton War Memorial (geograph 3013925, cropped... 0 .jpg 6 0.784314 0.052632 57 0.105263
545363 20171013200441 Bleach Green station, Whiteabbey (geograph 304... 0 .jpg 5 0.782609 0.058824 51 0.098039
545364 20171017193018 John Lynch and Ahkello Witherspoon 0 .jpg 4 1.000000 0.117647 34 0.117647
545365 20171018102515 Jaquiski Tartt 2017 0 .jpg 2 0.764706 0.105263 19 0.105263
545366 20171027011056 MOs810 WG 29 2017 Opolskie Zakamarki (in Arbor... 0 .jpg 10 0.800000 0.120000 75 0.133333
545367 20171029070311 The Madras treeshrew (Anathana ellioti) by Dav... 0 .jpg 7 0.940000 0.070175 57 0.122807
545368 20180124223331 Gabriel Bau 0 .jpg 1 1.000000 0.181818 11 0.090909
545369 20180208052720 Spolek podkrkonošských výtvarníků Trutnov (SPV... 0 .pdf 5 0.960000 0.109091 55 0.090909
545370 20190111170812 Zh-wō 0 .oga 0 0.800000 0.200000 5 0.000000
545371 20190131081524 Piazzale degli Uffizi 2013 (165453389) 0 .jpg 4 0.558824 0.052632 38 0.105263

Logistic Regression

# Select the engineered features by name rather than by position, so adding
# or reordering columns in `data` cannot silently change the model inputs.
feature_columns = ['n_spaces', 'alpha_ratio', 'upper_ratio', 'name_length', 'space_ratio']
X = data[feature_columns]
# Target: 1 for a name before the rename, 0 for the name after it.
y = data.is_old
# NOTE(review): the recorded results show recall 1.0, a large negative
# n_spaces coefficient, and every misclassified row having n_spaces == 0.
# That pattern would arise if old names never contain whitespace (e.g. they
# are stored with underscores) -- confirm against the raw log before
# trusting the accuracy figures.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Hold out 25% of the rows for evaluation, with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=2019
)

# Fit a logistic regression with default settings and predict the held-out rows.
logistic_model = LogisticRegression().fit(x_train, y_train)
y_pred = logistic_model.predict(x_test)

# Report the three headline metrics on the held-out rows.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
Accuracy: 0.9953426285177823
Precision: 0.9907441148604329
Recall: 1.0
# Confusion matrix
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
# Confusion matrix of true labels against predictions on the held-out rows.
confusion_matrix(y_true=y_test, y_pred=y_pred)
array([[67738,   635],
       [    0, 67970]])
# ROC curve inputs: score each held-out row with the second column of
# predict_proba, then compute the full curve (no points dropped) and its AUC.
y_pred_prob = logistic_model.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_pred_prob, drop_intermediate=False)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

%matplotlib inline
import matplotlib.pyplot as plt
# Draw the ROC curve on a fresh figure, with the diagonal as the chance baseline.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label='ROC curve (area = %0.4f)' % roc_auc,
)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC curve')
roc_ax.legend(loc="lower right")
plt.show()
# Coefficients: print the feature names, then the fitted weights in the same order.
print(X.columns.tolist())
print(logistic_model.coef_)
['n_spaces', 'alpha_ratio', 'upper_ratio', 'name_length', 'space_ratio']
[[-10.51005914  -5.32859871   4.35490908   0.04048372  -0.51010215]]

Run the model again using only file-renaming log entries from 2015 onward

The file renaming criteria were established in 2014: https://commons.wikimedia.org/wiki/Commons:Requests_for_comment/File_renaming_criterion_2

# Keep only log entries from 2015 onward. Timestamps are integers of the
# form YYYYMMDDhhmmss, so this threshold sorts before any 2015 timestamp.
# The subset is computed once and the features are selected by name, which
# avoids chained indexing and reliance on column position.
recent = data.loc[data.timestamp >= 20150000000000]
X = recent[['n_spaces', 'alpha_ratio', 'upper_ratio', 'name_length', 'space_ratio']]
y = recent.is_old
# Same split, model and metrics as before, applied to the current X and y.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=2019
)
logistic_model = LogisticRegression().fit(x_train, y_train)
y_pred = logistic_model.predict(x_test)
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
Accuracy: 0.9961148667399292
Precision: 0.9922643386009504
Recall: 1.0
# Confusion matrix for the refitted model on its held-out rows.
confusion_matrix(y_true=y_test, y_pred=y_pred)
array([[44842,   350],
       [    0, 44895]])
# Feature names followed by the refitted weights, in matching order.
print(X.columns.tolist())
print(logistic_model.coef_)
['n_spaces', 'alpha_ratio', 'upper_ratio', 'name_length', 'space_ratio']
[[-11.20799317  -5.3812626    5.42293396   0.05808637  -0.530559  ]]

Check misclassified instances

# Collect the index labels of every held-out row whose prediction differs
# from its true label. y_test keeps the row labels of `data`, so a single
# boolean mask over its values replaces the manual counter and per-row lookup.
# NOTE(review): the recorded output below (635 rows, some with pre-2015
# timestamps) matches the first, full-data model rather than the 2015+ model
# fitted just above -- re-run the cells in order before interpreting it.
misclassified_index = y_test.index[y_test.values != y_pred].tolist()

# Pull the full feature rows for the misclassified names and display them.
misclass_df = data.loc[misclassified_index, :]
misclass_df
timestamp name is_old extension n_spaces alpha_ratio upper_ratio name_length space_ratio
422121 20150726041653 Utrecht-Slangemuur-Rijksmonument-450498 0 .jpg 0 0.769231 0.076923 39 0.0
346117 20100605122548 Kruševac-ruins,Borak07291 0 .JPG 0 0.720000 0.080000 25 0.0
418010 20120929021853 Normandie 0 .jpg 0 1.000000 0.111111 9 0.0
276722 20121124044303 運乗寺 0 .jpg 0 1.000000 0.000000 3 0.0
493058 20120420053634 雁形目鸟类,摄于北京动物园,自拍,本图片不允许百度百科使用 0 .jpg 0 0.896552 0.000000 29 0.0
314601 20171103170005 杜湖全景 0 .jpg 0 1.000000 0.000000 4 0.0
543820 20110501142857 Rojca 0 .jpg 0 1.000000 0.200000 5 0.0
333036 20140511173850 Чатир-Даг 0 .jpg 0 0.888889 0.222222 9 0.0
308882 20161018184713 舊太原車站 0 .jpg 0 1.000000 0.000000 5 0.0
293058 20140529092936 MRT2SantolanStationExterior6 0 .jpg 0 0.928571 0.214286 28 0.0
444125 20170420223650 Philodromus-pjt 0 .jpg 0 0.933333 0.066667 15 0.0
418504 20161024012211 En-ca-peso 0 .oga 0 0.800000 0.100000 10 0.0
315011 20120913164637 Thegcast 0 .jpg 0 1.000000 0.125000 8 0.0
437036 20170106101100 岐阜工業高等専門学校第一体育館 0 .jpg 0 1.000000 0.000000 15 0.0
388651 20130215153448 Hubner1821SammlExotSchmett2Plate25 0 .jpg 0 0.794118 0.147059 34 0.0
275423 20150222173006 Фотосет 0 .jpg 0 1.000000 0.142857 7 0.0
410334 20181103181607 Dupont-Moretti 0 .jpg 0 0.928571 0.142857 14 0.0
380076 20170105185024 Ərkivan2 0 .jpg 0 0.875000 0.125000 8 0.0
289011 20140117142348 Burhave-2013-05-RaBoe-590 0 .jpg 0 0.480000 0.120000 25 0.0
276354 20121007191749 Hofburg-Ahnensaal 0 .jpg 0 0.941176 0.117647 17 0.0
371514 20130514135646 绮春园残桥 0 .jpg 0 1.000000 0.000000 5 0.0
416560 20180816111133 新竹大車站及新竹輕軌計畫模型 0 .jpg 0 1.000000 0.000000 14 0.0
288146 20131128125250 Bjarni-Benediktsson-public-radio-announcement-... 0 .jpg 0 0.820000 0.040000 50 0.0
466418 20111012045050 Walhalla-Ravine 0 .jpg 0 0.933333 0.133333 15 0.0
489882 20160907052615 서울시립남서울미술관 0 .jpg 0 1.000000 0.000000 10 0.0
380798 20120512093311 Xylota-pjt 0 .jpg 0 0.900000 0.100000 10 0.0
544093 20171021162814 AkiyamaRyuji3F4A0238 0 .jpg 0 0.700000 0.200000 20 0.0
282490 20140917221502 Grjótagjá-pjt1 0 .jpg 0 0.857143 0.071429 14 0.0
406404 20171108152554 Escudotorremenga 0 .png 0 1.000000 0.062500 16 0.0
315308 20170625160731 Aizoaceae 0 .jpg 0 1.000000 0.111111 9 0.0
... ... ... ... ... ... ... ... ... ...
370990 20180110114609 (1)Morella 0 .jpg 0 0.700000 0.100000 10 0.0
372318 20140715130622 Echinacea-purpurea-with-bumblebee-2 0 .jpg 0 0.857143 0.028571 35 0.0
274520 20130319234808 NagayamonParkMap 0 .jpg 0 1.000000 0.187500 16 0.0
298479 20111202194143 善化中山路街景 0 .jpg 0 1.000000 0.000000 7 0.0
490082 20170824162617 GriesSulztal 0 .jpg 0 1.000000 0.166667 12 0.0
359223 20130422122850 StPeterAndPaulRussoGreekCatholicOrthodoxChurch... 0 .jpg 0 0.981818 0.181818 55 0.0
409024 20180119204521 白沙坑文德宮牌樓 0 .jpg 0 1.000000 0.000000 8 0.0
401214 20180213195254 焉-bronze-warring 0 .svg 0 0.875000 0.000000 16 0.0
475753 20160113062649 Elk 0 .webm 0 1.000000 0.333333 3 0.0
537483 20130509064735 CCFL548Jameson@zebraSé(2012) 0 .jpg 0 0.642857 0.214286 28 0.0
380481 20120504190502 Koden-Saint-Michael-Archangel-orthodox-church,jpg 0 .jpg 0 0.877551 0.081633 49 0.0
385282 20140329002255 Rödbukspiraya 0 .jpg 0 1.000000 0.076923 13 0.0
278369 20130122164223 Flower-Kukherd 0 .jpg 0 0.928571 0.142857 14 0.0
308083 20150106192845 Molen-Geesina-Utrecht 0 .jpg 0 0.904762 0.142857 21 0.0
538330 20130427005513 CCFL548deFrente2012 0 .jpg 0 0.631579 0.263158 19 0.0
368219 20110407164939 PT50Pauline 0 .jpg 0 0.818182 0.272727 11 0.0
275400 20150221090925 Japchae 0 .jpg 0 1.000000 0.142857 7 0.0
446805 20170709165400 上田市立浦里小学校正門 0 .jpg 0 1.000000 0.000000 11 0.0
414188 20180605202311 2018福建高考长乐华侨中学考点考场物品存放处 0 .jpg 0 0.826087 0.000000 23 0.0
445972 20170611172330 CilViciaNarbonensis 0 .jpg 0 1.000000 0.157895 19 0.0
469812 20130329112511 BfSeddinSchaltkasten 0 .jpg 0 1.000000 0.150000 20 0.0
307242 20110228223807 太宰府天満宮の参道 0 .jpg 0 1.000000 0.000000 9 0.0
276714 20121124044023 小金井良精0016 0 .jpg 0 0.555556 0.000000 9 0.0
288191 20131202034649 BriarcliffLawPark2013 0 .tiff 0 0.809524 0.142857 21 0.0
297574 20110313233858 BoatUSVesselAssist 0 .jpg 0 1.000000 0.277778 18 0.0
369345 20170211205243 Metro-bruxelles 0 .jpg 0 0.933333 0.066667 15 0.0
415250 20180630231733 《國語留聲片課本》兩拼字全表 0 .jpg 0 0.857143 0.000000 14 0.0
480028 20160418192058 Herodotus1973 0 .jpg 0 0.692308 0.076923 13 0.0
414678 20180616094308 Velký-Osek-železniční-stanice 0 .jpg 0 0.896552 0.068966 29 0.0
544189 20131215085109 Euromaidan-in-Kyiv-12-14-094433 0 .JPG 0 0.516129 0.064516 31 0.0

635 rows × 9 columns

Cross validation and check misclassified instances

 

Detect language for misclassified instances

from guess_language import guess_language
from collections import Counter
# Guess a language code for every misclassified name, then tally the guesses.
out_lang = [guess_language(file_name) for file_name in misclass_df.name]

Counter(out_lang)
Counter({'nl': 7,
         'sl': 6,
         'UNKNOWN': 364,
         'zh': 106,
         'la': 16,
         'nb': 8,
         'it': 7,
         'ko': 2,
         'nr': 2,
         'ro': 3,
         'tl': 2,
         'ta': 3,
         'ceb': 3,
         'en': 9,
         'de': 11,
         'et': 7,
         'zu': 1,
         'id': 7,
         'hr': 3,
         'da': 4,
         'ja': 8,
         'pt': 4,
         'es': 4,
         'ca': 4,
         'sw': 1,
         'af': 5,
         'th': 5,
         'fr': 6,
         'sv': 4,
         'eo': 3,
         'fi': 4,
         'bn': 1,
         'is': 2,
         'ha': 2,
         'el': 1,
         'lv': 1,
         'eu': 3,
         'lt': 2,
         'ss': 1,
         'cy': 1,
         'tr': 1,
         'cs': 1})
# Any existing bot? Count rename-log entries per username and compute each
# user's share of all entries ('percentage' holds a 0-1 fraction).
by_user = df.groupby('username').agg({'log_timestamp': 'count'})
by_user['percentage'] = by_user['log_timestamp'] / by_user['log_timestamp'].sum()
# Show the 20 most active usernames.
by_user.sort_values(by='percentage', ascending=False).head(20)
log_timestamp percentage
username
78178 0.286704
Wieralee 13976 0.051255
Retired electrician 8139 0.029848
Krassotkin 6717 0.024633
Uli Elch 5844 0.021432
Marcus Cyron 5722 0.020984
Rudolphous 4726 0.017332
BD2412 4170 0.015293
INeverCry 4087 0.014988
CAPTAIN RAJU 4042 0.014823
Bubo bubo 3806 0.013958
Courcelles 2805 0.010287
Bohème 2626 0.009630
Russavia 2575 0.009443
Zcarstvnz 2561 0.009392
JuTa 2318 0.008501
Faebot 2140 0.007848
Materialscientist 2082 0.007635
Brackenheim 2059 0.007551
Cobatfor 1990 0.007298