import gzip
import json
import re

import mwapi
import pandas as pd
# Open an API session against English Wikipedia; the user agent string
# identifies this project to the server.
session = mwapi.Session(
    host='https://en.wikipedia.org',
    user_agent='mwapi (python) -- outreachy content translation',
)

# Query the `cxpublishedtranslations` list for translations published from
# English ('from') to Spanish ('to'), requesting up to 500 entries starting
# at offset 20000.
parameters = {
    'action': 'query',
    'format': 'json',
    'list': 'cxpublishedtranslations',
    'from': 'en',
    'to': 'es',
    'limit': 500,
    'offset': 20000,
}
res = session.get(parameters)

# Notebook-style preview of the first ten returned translation records.
res['result']['translations'][:10]
[{'translationId': '374559',
  'sourceTitle': 'Los Americans',
  'targetTitle': 'Usuario:Ivonnejo04/Los Americans',
  'sourceLanguage': 'en',
  'sourceRevisionId': '801947468',
  'targetRevisionId': '102209427',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Los Americans',
  'targetURL': '//es.wikipedia.org/wiki/Los Americans',
  'publishedDate': '20170928184257',
  'stats': {'any': 0.94256474519632,
   'human': 0.029866332497911,
   'mt': 0.91269841269841,
   'mtSectionsCount': 11}},
 {'translationId': '374655',
  'sourceTitle': 'American Vandal',
  'targetTitle': 'Usuario:Misaelflorezav/American Vandal',
  'sourceLanguage': 'en',
  'sourceRevisionId': '802832925',
  'targetRevisionId': '102214615',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/American Vandal',
  'targetURL': '//es.wikipedia.org/wiki/Usuario:Misaelflorezav/American Vandal',
  'publishedDate': '20170928215416',
  'stats': {'any': 0.21890547263682,
   'human': 0.084920226453937,
   'mt': 0.13398524618288,
   'mtSectionsCount': 2}},
 {'translationId': '374660',
  'sourceTitle': 'End Records',
  'targetTitle': 'End Records',
  'sourceLanguage': 'en',
  'sourceRevisionId': '746876472',
  'targetRevisionId': '102215116',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/End Records',
  'targetURL': '//es.wikipedia.org/wiki/End Records',
  'publishedDate': '20170928221445',
  'stats': {'any': 1.1047708138447,
   'human': 1.1047708138447,
   'mt': 0,
   'mtSectionsCount': 0}},
 {'translationId': '374661',
  'sourceTitle': '60S ribosomal protein L10',
  'targetTitle': 'Proteína L10 de la unidad ribosómica 60S\xa0',
  'sourceLanguage': 'en',
  'sourceRevisionId': '797871239',
  'targetRevisionId': '102215432',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/60S ribosomal protein L10',
  'targetURL': '//es.wikipedia.org/wiki/Proteína L10 de la unidad ribosómica 60S\xa0',
  'publishedDate': '20170928223045',
  'stats': {'any': 0.27921220127297,
   'human': 0.27476882430647,
   'mt': 0.0044433769664945,
   'mtSectionsCount': 3}},
 {'translationId': '374663',
  'sourceTitle': 'Ned Washington',
  'targetTitle': 'Ned Washington',
  'sourceLanguage': 'en',
  'sourceRevisionId': '796534066',
  'targetRevisionId': '102215375',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Ned Washington',
  'targetURL': '//es.wikipedia.org/wiki/Ned Washington',
  'publishedDate': '20170928222819',
  'stats': {'any': 1.0329474621549,
   'human': 1.0255268625705,
   'mt': 0.0074205995844464,
   'mtSectionsCount': 2}},
 {'translationId': '374674',
  'sourceTitle': 'Greek legislative election, 1936',
  'targetTitle': 'Elecciones parlamentarias de Grecia de 1936',
  'sourceLanguage': 'en',
  'sourceRevisionId': '797738429',
  'targetRevisionId': '102216616',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Greek legislative election, 1936',
  'targetURL': '//es.wikipedia.org/wiki/Elecciones parlamentarias de Grecia de 1936',
  'publishedDate': '20170928233753',
  'stats': {'any': 0.74281709880869,
   'human': 0.7281009110021,
   'mt': 0.014716187806587,
   'mtSectionsCount': 2}},
 {'translationId': '374675',
  'sourceTitle': 'Maltese general election, 1927',
  'targetTitle': 'Elecciones generales de Malta de 1927',
  'sourceLanguage': 'en',
  'sourceRevisionId': '787160410',
  'targetRevisionId': '102216883',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Maltese general election, 1927',
  'targetURL': '//es.wikipedia.org/wiki/Elecciones generales de Malta de 1927',
  'publishedDate': '20170928235113',
  'stats': {'any': 0.94422310756972,
   'human': 0.89322709163347,
   'mt': 0.050996015936255,
   'mtSectionsCount': 5}},
 {'translationId': '374719',
  'sourceTitle': 'Dançando',
  'targetTitle': 'Dançando',
  'sourceLanguage': 'en',
  'sourceRevisionId': '801281364',
  'targetRevisionId': '102222244',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Dançando',
  'targetURL': '//es.wikipedia.org/wiki/Dançando',
  'publishedDate': '20170929060104',
  'stats': {'any': 0.86393027832443,
   'human': 0.85704245150408,
   'mt': 0.0068878268203542,
   'mtSectionsCount': 4}},
 {'translationId': '374751',
  'sourceTitle': 'BFR (rocket)',
  'targetTitle': 'BFR (cohete)',
  'sourceLanguage': 'en',
  'sourceRevisionId': '802912035',
  'targetRevisionId': '102223820',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/BFR (rocket)',
  'targetURL': '//es.wikipedia.org/wiki/BFR (cohete)',
  'publishedDate': '20170929080018',
  'stats': {'any': 0.014279624893436,
   'human': 0.00078147200909349,
   'mt': 0.013498152884342,
   'mtSectionsCount': 4}},
 {'translationId': '374858',
  'sourceTitle': 'MatrixNet',
  'targetTitle': 'MatrixNet',
  'sourceLanguage': 'en',
  'sourceRevisionId': '774931364',
  'targetRevisionId': '102229789',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/MatrixNet',
  'targetURL': '//es.wikipedia.org/wiki/MatrixNet',
  'publishedDate': '20170929150431',
  'stats': {'any': 0.61847988077496,
   'human': 0.55290611028316,
   'mt': 0.065573770491803,
   'mtSectionsCount': 4}}]
# Tabulate the translation records: one row per record, one column per key.
df = pd.DataFrame(data=res['result']['translations'])

# Notebook-style preview of the first ten rows.
df.head(10)
publishedDate sourceLanguage sourceRevisionId sourceTitle sourceURL stats targetLanguage targetRevisionId targetTitle targetURL translationId
0 20170928184257 en 801947468 Los Americans //en.wikipedia.org/wiki/Los Americans {'any': 0.94256474519632, 'human': 0.029866332... es 102209427 Usuario:Ivonnejo04/Los Americans //es.wikipedia.org/wiki/Los Americans 374559
1 20170928215416 en 802832925 American Vandal //en.wikipedia.org/wiki/American Vandal {'any': 0.21890547263682, 'human': 0.084920226... es 102214615 Usuario:Misaelflorezav/American Vandal //es.wikipedia.org/wiki/Usuario:Misaelflorezav... 374655
2 20170928221445 en 746876472 End Records //en.wikipedia.org/wiki/End Records {'any': 1.1047708138447, 'human': 1.1047708138... es 102215116 End Records //es.wikipedia.org/wiki/End Records 374660
3 20170928223045 en 797871239 60S ribosomal protein L10 //en.wikipedia.org/wiki/60S ribosomal protein L10 {'any': 0.27921220127297, 'human': 0.274768824... es 102215432 Proteína L10 de la unidad ribosómica 60S //es.wikipedia.org/wiki/Proteína L10 de la uni... 374661
4 20170928222819 en 796534066 Ned Washington //en.wikipedia.org/wiki/Ned Washington {'any': 1.0329474621549, 'human': 1.0255268625... es 102215375 Ned Washington //es.wikipedia.org/wiki/Ned Washington 374663
5 20170928233753 en 797738429 Greek legislative election, 1936 //en.wikipedia.org/wiki/Greek legislative elec... {'any': 0.74281709880869, 'human': 0.728100911... es 102216616 Elecciones parlamentarias de Grecia de 1936 //es.wikipedia.org/wiki/Elecciones parlamentar... 374674
6 20170928235113 en 787160410 Maltese general election, 1927 //en.wikipedia.org/wiki/Maltese general electi... {'any': 0.94422310756972, 'human': 0.893227091... es 102216883 Elecciones generales de Malta de 1927 //es.wikipedia.org/wiki/Elecciones generales d... 374675
7 20170929060104 en 801281364 Dançando //en.wikipedia.org/wiki/Dançando {'any': 0.86393027832443, 'human': 0.857042451... es 102222244 Dançando //es.wikipedia.org/wiki/Dançando 374719
8 20170929080018 en 802912035 BFR (rocket) //en.wikipedia.org/wiki/BFR (rocket) {'any': 0.014279624893436, 'human': 0.00078147... es 102223820 BFR (cohete) //es.wikipedia.org/wiki/BFR (cohete) 374751
9 20170929150431 en 774931364 MatrixNet //en.wikipedia.org/wiki/MatrixNet {'any': 0.61847988077496, 'human': 0.552906110... es 102229789 MatrixNet //es.wikipedia.org/wiki/MatrixNet 374858
# Pull the identifying fields of the translation at index label 8 in a single
# label-based selection, then unpack them in the order they were requested.
# NOTE(review): the `gboost_` prefix does not obviously match this row's
# article title -- confirm that row 8 is the intended example.
(
    gboost_source_title,
    gboost_target_title,
    gboost_tid,
    gboost_source_revid,
) = df.loc[8, ['sourceTitle', 'targetTitle', 'translationId', 'sourceRevisionId']]

# Emit one value per line.
print(
    gboost_source_title,
    gboost_target_title,
    gboost_tid,
    gboost_source_revid,
    sep='\n',
)
BFR (rocket)
BFR (cohete)
374751
802912035
# Load the gzipped parallel-corpus dump into a single string, stripping the
# leading/trailing whitespace (including the newline) from every line before
# the pieces are glued together.
#
# * The encoding is given explicitly so decoding does not depend on the
#   platform's locale default; this assumes the dump is UTF-8, as JSON
#   interchange normally is -- confirm if the file comes from another source.
# * "".join builds the result in one pass instead of re-copying the growing
#   string on every `+=`.
with gzip.open('cx-corpora.en2es.text.json.gz', 'rt', encoding='utf-8') as fin:
    json_str = "".join(line.strip() for line in fin)