import pymysql
import datetime
import pywikibot
from ipywidgets import interact
conn = pymysql.connect(host='paws-db', user='cscw', password='cscw')
c = conn.cursor()
someusers = moreusers(28000000)
user_sessions = dict()

while(True):
    someusers_string = next(someusers)
    print(someusers_string)
    c.execute(someusers_string)
    results = c.fetchall()
    clean_results = map(lambda t: t[0], results)
    str_clean = [res.decode("utf-8")  for res in clean_results]
    #print(str_clean)

    for user in str_clean:
        #print(user)
        try:
            user_sessions[user] = session_lens(user)
        except UnicodeEncodeError:
            pass
len(user_sessions)
user_sessions
user_sessions_nonempty = [(user, sessions) for (user, sessions) in user_sessions.items() if sessions]
len(user_sessions_nonempty)
sessions_nameless = [i for u,i in user_sessions_nonempty]
userdf = pandas.DataFrame.from_records(user_sessions_nonempty,columns=['user','sessionlist'])
userdf.head()
def first_session_long(sessionlist,second_cutoff):
    first_session = sessionlist[0]
    try:
        second_session = sessionlist[1]
    except:
        return first_session[1]>second_cutoff
    return first_session[1]>second_cutoff or second_session[1]>second_cutoff #secondpart is seconds
userdf['longfirst']=userdf['sessionlist'].apply(lambda x: first_session_long(x, second_cutoff=0))
longfirst_data = userdf[userdf['longfirst']==True]['sessionlist'].sum()
longfirst_sessiondyn = pandas.DataFrame.from_records(longfirst_data,columns=(['sessionnumber','sessionlen']))

shortfirst_data = userdf[userdf['longfirst']==False]['sessionlist'].sum()
shortfirst_sessiondyn = pandas.DataFrame.from_records(shortfirst_data,columns=(['sessionnumber','sessionlen']))
longfirst_sessiondyn.plot(kind='scatter', x='sessionnumber',y='sessionlen',title="Session lengths of users \n whose first or second sessions > 1 second")
shortfirst_sessiondyn.plot(kind='scatter', x='sessionnumber',y='sessionlen', title="Session lengths of users \n whose first or second session are 1 edit only")
userdf['totalsessions'] = userdf['sessionlist'].apply(lambda x: len(x))
 
userdf[userdf['longfirst']==False].mean()

flat_sessions = [] for user, sessionlist in user_sessions_nonempty: numsessions = len(sessionlist) for session in sessionlist: lsession = list(session) triple = lsession+[numsessions] flat_sessions.append((user, triple))

user_file = open('/home/paws/user_session_lens.json','w')
user_file_nameless = open('/home/paws/user_session_lens_nameless.json','w')
import csv
import json
json.dump(user_sessions_nonempty, user_file)
json.dump(flat_sessions, user_file_nameless)
import pandas
sessionsdf = pandas.DataFrame.from_records(flat_sessions, columns=['sessionnumber', 'sessionlen','totalusersessions'])
sessionsdf
sessionsdf.to_csv('sessions.csv', index=False)
import matplotlib.pyplot as plt
%pylab inline
sessionsdf.plot(kind='scatter', y='sessionlen', x='sessionnumber',xlim=(0,20))
sessionsdf[sessionsdf['totalusersessions']<20].plot(kind='scatter', y='sessionlen', x='sessionnumber',xlim=(0,20), ylim=(0,30000))
sessionsdf[(sessionsdf['totalusersessions']<20) & (sessionsdf['totalusersessions']>10)].plot(kind='scatter', y='sessionlen', x='sessionnumber',xlim=(0,20), ylim=(0,30000))
sessionsdf[sessionsdf['totalusersessions']>=20].plot(kind='scatter', y='sessionlen', x='sessionnumber', xlim=(0,20), ylim=(0,30000))
sessionsdf.plot(kind='scatter', y='sessionlen', x='totalusersessions', xlim=(0,200))
sessionsdf[sessionsdf['sessionnumber']==0] 
sessionsdf[shortfirst].plot(kind='scatter', y='sessionlen', x='sessionnumber')

user, start, end, session_number, number_revs_in_session

def session_len(session):
    start, end = [pywikibot.Timestamp.fromtimestampformat(time.decode('utf-8')) for time in (session[1], session[2])]
    slength = end-start
    return session[3], slength.seconds #sessionnumber, sessionlenght in seconds

def session_lens(user):
    c.execute("select * from datasets_p.enwiki_sessions_20150801 where user_text = %s" , (user,))
    sessions = c.fetchall()

    slens = [session_len(sess) for sess in sessions]
    return slens
def moreusers(totalusers):
    k = 0
    min, max = 0, 0
    while(max < totalusers):
        min, max = (k*1000000)+1, (k+1)*1000000
        qstring = "select user_name from enwiki_p.user where user_id  between {} and {} and rand()< 0.0004;".format(min, max)
        yield qstring
        k+=1
import json
jfile = open('/home/paws/user_hours_edits.json','w')
json.dump(user_hours, jfile)
def calculate_edit_hours(user, cursor):
    starttime = datetime.datetime.now()
    qstring = u'''SELECT rev_timestamp FROM enwiki_p.revision_userindex WHERE rev_user_text like "'''+ user + u'''";'''
    cursor.execute(qstring)
    results = cursor.fetchall()
    clean_results = map(lambda t: t[0], results)
    str_clean = [res.decode("utf-8")  for res in clean_results]
    timestamps = [pywikibot.Timestamp.fromtimestampformat(x) for x in str_clean]
    #print len(timestamps)                                                                                                                                     
    edit_sessions = []
    curr_edit_session = []

    prev_timestamp = datetime.datetime(year=2001, month=1, day=1)


    for contrib in timestamps:
        curr_timestamp = contrib

        #if curr_timestamp > snapshot_timestamp:                                                                                                               
        #    break                                                                                                                                             
 
        if curr_timestamp-prev_timestamp < datetime.timedelta(hours=1):
            curr_edit_session.append(curr_timestamp)
            prev_timestamp = curr_timestamp

        else:
            if curr_edit_session:
                edit_sessions.append(curr_edit_session)
            curr_edit_session = [curr_timestamp]
            prev_timestamp = curr_timestamp

    #finally have to add the curr_edit_session to list                                                                                                         
    if curr_edit_session:
        edit_sessions.append(curr_edit_session)


    #print len(edit_sessions)                                                                                                                                  
    def session_length(edit_session):
        avg_time = datetime.timedelta(minutes=7, seconds=10)
        last = edit_session[-1]
        first = edit_session[0]
        span = last - first
        total = span + avg_time
        return total

    session_lengths = map(session_length, edit_sessions)
    second_lens = map(lambda td: td.total_seconds(), session_lengths)
    total_time = sum(second_lens)

    took = datetime.datetime.now() - starttime
    tooksecs = took.total_seconds()
    #print('timestamps per second: ', len(timestamps)/float(tooksecs))
    #returning total hours                                                                                                                                     
    hours = total_time / float(3600)
    editcount = len(timestamps)
    return hours, editcount
@interact
def calc(username=''):
    conn.ping(True)
    calculate_edit_hours(username, conn.cursor())
calculate_edit_hours('Maximilianklein',c)