# Imports
import gzip
import json
import re

import mwapi
import pandas as pd

Some notes about the cxpublishedtranslations API:

The example and overview at https://www.mediawiki.org/wiki/Content_translation/Published_translations#List_of_published_source_and_target_titles describe the request parameters; among them you may notice an 'offset'. The API will only return 500 results at a time, and given that there are often more than 500 articles that were translated, this parameter tells the API which results you want. For the cxpublishedtranslations API, the articles are sorted by the date they were translated, so the deeper into the dataset you go (the higher the offset), the newer the translations are. We do not have full data for articles that were translated before 2016-01-22, so I chose an offset (20000) that would get me a list of articles translated after that date. You may have to try a few offsets to find one that works. If the results are empty, that probably means your offset is too high.

# API session pointed at English Wikipedia; every metadata query below goes
# through it, and the user agent string identifies this notebook.
session = mwapi.Session(
    user_agent='mwapi (python) -- outreachy content translation',
    host='https://en.wikipedia.org',
)

# Query for published translations, English -> Spanish.
parameters = {
    'action': 'query',
    'format': 'json',
    'list': 'cxpublishedtranslations',
}
# Language pair ('from' is a Python keyword, so it has to be a dict key).
parameters.update({'from': 'en', 'to': 'es'})
# Paging: page size and the position in the result list to start from.
parameters.update(limit=500, offset=20000)

Get set of translated articles to dig more deeply into

# Run the query once and keep the whole response around for the cells below.
res = session.get(parameters)

# Preview: the first ten translation records of the response.
res['result']['translations'][0:10]
[{'translationId': '374655',
  'sourceTitle': 'American Vandal',
  'targetTitle': 'Usuario:Misaelflorezav/American Vandal',
  'sourceLanguage': 'en',
  'sourceRevisionId': '802832925',
  'targetRevisionId': '102214615',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/American Vandal',
  'targetURL': '//es.wikipedia.org/wiki/Usuario:Misaelflorezav/American Vandal',
  'publishedDate': '20170928215416',
  'stats': {'any': 0.21890547263682,
   'human': 0.084920226453937,
   'mt': 0.13398524618288,
   'mtSectionsCount': 2}},
 {'translationId': '374660',
  'sourceTitle': 'End Records',
  'targetTitle': 'End Records',
  'sourceLanguage': 'en',
  'sourceRevisionId': '746876472',
  'targetRevisionId': '102215116',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/End Records',
  'targetURL': '//es.wikipedia.org/wiki/End Records',
  'publishedDate': '20170928221445',
  'stats': {'any': 1.1047708138447,
   'human': 1.1047708138447,
   'mt': 0,
   'mtSectionsCount': 0}},
 {'translationId': '374661',
  'sourceTitle': '60S ribosomal protein L10',
  'targetTitle': 'Proteína L10 de la unidad ribosómica 60S\xa0',
  'sourceLanguage': 'en',
  'sourceRevisionId': '797871239',
  'targetRevisionId': '102215432',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/60S ribosomal protein L10',
  'targetURL': '//es.wikipedia.org/wiki/Proteína L10 de la unidad ribosómica 60S\xa0',
  'publishedDate': '20170928223045',
  'stats': {'any': 0.27921220127297,
   'human': 0.27476882430647,
   'mt': 0.0044433769664945,
   'mtSectionsCount': 3}},
 {'translationId': '374663',
  'sourceTitle': 'Ned Washington',
  'targetTitle': 'Ned Washington',
  'sourceLanguage': 'en',
  'sourceRevisionId': '796534066',
  'targetRevisionId': '102215375',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Ned Washington',
  'targetURL': '//es.wikipedia.org/wiki/Ned Washington',
  'publishedDate': '20170928222819',
  'stats': {'any': 1.0329474621549,
   'human': 1.0255268625705,
   'mt': 0.0074205995844464,
   'mtSectionsCount': 2}},
 {'translationId': '374674',
  'sourceTitle': 'Greek legislative election, 1936',
  'targetTitle': 'Elecciones parlamentarias de Grecia de 1936',
  'sourceLanguage': 'en',
  'sourceRevisionId': '797738429',
  'targetRevisionId': '102216616',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Greek legislative election, 1936',
  'targetURL': '//es.wikipedia.org/wiki/Elecciones parlamentarias de Grecia de 1936',
  'publishedDate': '20170928233753',
  'stats': {'any': 0.74281709880869,
   'human': 0.7281009110021,
   'mt': 0.014716187806587,
   'mtSectionsCount': 2}},
 {'translationId': '374675',
  'sourceTitle': 'Maltese general election, 1927',
  'targetTitle': 'Elecciones generales de Malta de 1927',
  'sourceLanguage': 'en',
  'sourceRevisionId': '787160410',
  'targetRevisionId': '102216883',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Maltese general election, 1927',
  'targetURL': '//es.wikipedia.org/wiki/Elecciones generales de Malta de 1927',
  'publishedDate': '20170928235113',
  'stats': {'any': 0.94422310756972,
   'human': 0.89322709163347,
   'mt': 0.050996015936255,
   'mtSectionsCount': 5}},
 {'translationId': '374719',
  'sourceTitle': 'Dançando',
  'targetTitle': 'Dançando',
  'sourceLanguage': 'en',
  'sourceRevisionId': '801281364',
  'targetRevisionId': '102222244',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Dançando',
  'targetURL': '//es.wikipedia.org/wiki/Dançando',
  'publishedDate': '20170929060104',
  'stats': {'any': 0.86393027832443,
   'human': 0.85704245150408,
   'mt': 0.0068878268203542,
   'mtSectionsCount': 4}},
 {'translationId': '374751',
  'sourceTitle': 'BFR (rocket)',
  'targetTitle': 'BFR (cohete)',
  'sourceLanguage': 'en',
  'sourceRevisionId': '802912035',
  'targetRevisionId': '102223820',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/BFR (rocket)',
  'targetURL': '//es.wikipedia.org/wiki/BFR (cohete)',
  'publishedDate': '20170929080018',
  'stats': {'any': 0.014279624893436,
   'human': 0.00078147200909349,
   'mt': 0.013498152884342,
   'mtSectionsCount': 4}},
 {'translationId': '374858',
  'sourceTitle': 'MatrixNet',
  'targetTitle': 'MatrixNet',
  'sourceLanguage': 'en',
  'sourceRevisionId': '774931364',
  'targetRevisionId': '102229789',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/MatrixNet',
  'targetURL': '//es.wikipedia.org/wiki/MatrixNet',
  'publishedDate': '20170929150431',
  'stats': {'any': 0.61847988077496,
   'human': 0.55290611028316,
   'mt': 0.065573770491803,
   'mtSectionsCount': 4}},
 {'translationId': '374869',
  'sourceTitle': 'Gradient boosting',
  'targetTitle': 'Gradient boosting',
  'sourceLanguage': 'en',
  'sourceRevisionId': '801498395',
  'targetRevisionId': '102230661',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Gradient boosting',
  'targetURL': '//es.wikipedia.org/wiki/Gradient boosting',
  'publishedDate': '20180524022255',
  'stats': {'any': 0.050171256336484,
   'human': 0.048417591450884,
   'mt': 0.0017536648856008,
   'mtSectionsCount': 3}}]

Alternative view of the data via Pandas

# One row per translation record, one column per metadata field.
df = pd.DataFrame(data=res['result']['translations'])

# Show the first ten rows.
df.iloc[:10]
publishedDate sourceLanguage sourceRevisionId sourceTitle sourceURL stats targetLanguage targetRevisionId targetTitle targetURL translationId
0 20170928215416 en 802832925 American Vandal //en.wikipedia.org/wiki/American Vandal {'any': 0.21890547263682, 'human': 0.084920226... es 102214615 Usuario:Misaelflorezav/American Vandal //es.wikipedia.org/wiki/Usuario:Misaelflorezav... 374655
1 20170928221445 en 746876472 End Records //en.wikipedia.org/wiki/End Records {'any': 1.1047708138447, 'human': 1.1047708138... es 102215116 End Records //es.wikipedia.org/wiki/End Records 374660
2 20170928223045 en 797871239 60S ribosomal protein L10 //en.wikipedia.org/wiki/60S ribosomal protein L10 {'any': 0.27921220127297, 'human': 0.274768824... es 102215432 Proteína L10 de la unidad ribosómica 60S //es.wikipedia.org/wiki/Proteína L10 de la uni... 374661
3 20170928222819 en 796534066 Ned Washington //en.wikipedia.org/wiki/Ned Washington {'any': 1.0329474621549, 'human': 1.0255268625... es 102215375 Ned Washington //es.wikipedia.org/wiki/Ned Washington 374663
4 20170928233753 en 797738429 Greek legislative election, 1936 //en.wikipedia.org/wiki/Greek legislative elec... {'any': 0.74281709880869, 'human': 0.728100911... es 102216616 Elecciones parlamentarias de Grecia de 1936 //es.wikipedia.org/wiki/Elecciones parlamentar... 374674
5 20170928235113 en 787160410 Maltese general election, 1927 //en.wikipedia.org/wiki/Maltese general electi... {'any': 0.94422310756972, 'human': 0.893227091... es 102216883 Elecciones generales de Malta de 1927 //es.wikipedia.org/wiki/Elecciones generales d... 374675
6 20170929060104 en 801281364 Dançando //en.wikipedia.org/wiki/Dançando {'any': 0.86393027832443, 'human': 0.857042451... es 102222244 Dançando //es.wikipedia.org/wiki/Dançando 374719
7 20170929080018 en 802912035 BFR (rocket) //en.wikipedia.org/wiki/BFR (rocket) {'any': 0.014279624893436, 'human': 0.00078147... es 102223820 BFR (cohete) //es.wikipedia.org/wiki/BFR (cohete) 374751
8 20170929150431 en 774931364 MatrixNet //en.wikipedia.org/wiki/MatrixNet {'any': 0.61847988077496, 'human': 0.552906110... es 102229789 MatrixNet //es.wikipedia.org/wiki/MatrixNet 374858
9 20180524022255 en 801498395 Gradient boosting //en.wikipedia.org/wiki/Gradient boosting {'any': 0.050171256336484, 'human': 0.04841759... es 102230661 Gradient boosting //es.wikipedia.org/wiki/Gradient boosting 374869
# Running example for the rest of the notebook: the row(s) whose English
# source article is 'Gradient boosting'.
df.loc[df['sourceTitle'] == 'Gradient boosting']
publishedDate sourceLanguage sourceRevisionId sourceTitle sourceURL stats targetLanguage targetRevisionId targetTitle targetURL translationId
9 20180524022255 en 801498395 Gradient boosting //en.wikipedia.org/wiki/Gradient boosting {'any': 0.050171256336484, 'human': 0.04841759... es 102230661 Gradient boosting //es.wikipedia.org/wiki/Gradient boosting 374869
# Pull out the identifiers of the 'Gradient boosting' example.
# The row is selected by title instead of by a hard-coded index: the position
# of a record depends on what the API returned, and index 8 is a different
# article ('MatrixNet'), not the example used in the rest of the notebook.
gboost_matches = df[df['sourceTitle'] == 'Gradient boosting']
if gboost_matches.empty:
    # Fail loudly rather than silently continuing with the wrong article.
    raise LookupError("'Gradient boosting' is not in the fetched translations; "
                      "try a different offset")
gboost_row = gboost_matches.iloc[0]

gboost_source_title = gboost_row['sourceTitle']
gboost_target_title = gboost_row['targetTitle']
gboost_tid = gboost_row['translationId']
gboost_source_revid = gboost_row['sourceRevisionId']
print(gboost_source_title)
print(gboost_target_title)
print(gboost_tid)
print(gboost_source_revid)
MatrixNet
MatrixNet
374858
774931364

Get corresponding parallel translation

Parallel translations can either be accessed through the dump files or API. Use the dump files if you are planning on analyzing the entire corpus (or a large proportion) of translated articles. The API is best for looking at a few examples.

Dump files

The dump files give you local access to the parallel translations and are the "friendly" way to access the data, especially if you are looking at a lot of examples. To use them: (1) download the most recent .text.json.gz file based on the instructions here: https://www.mediawiki.org/wiki/Content_translation/Published_translations#Dumps ; (2) upload the dump file to PAWS (there is an upload button on your dashboard; click on the PAWS logo in the top right to get there). Note that there is a bug where the dump files have extra commas that break the JSON schema and lead to an error if you call json.load directly on the fin variable. Instead, you have to remove them as below to load in the dump file.

API

This is the quickest way to access the parallel translations, but it is best for looking at just a few examples. See the overview: https://www.mediawiki.org/wiki/Content_translation/Published_translations#API

# 1st option: Download dump to access all translated articles
def _parse_lenient_json_array(text):
    """Parse a JSON array whose top-level elements may have stray commas.

    The dump files contain extra commas that make them invalid JSON.
    Collapsing runs of commas with a regex is not enough: a comma left
    directly after '[' or before ']' still makes ``json.loads`` fail with
    "Expecting value", and the regex also rewrites any ',,' that occurs
    inside translated text. Instead, each element is decoded on its own and
    any whitespace/commas *between* elements are skipped, so string contents
    are never modified.

    Args:
        text: the full JSON document as a string.

    Returns:
        A list with the decoded top-level elements, in file order.

    Raises:
        json.JSONDecodeError: if the document is not an array, an element is
            malformed, or the closing ']' is missing.
    """
    decoder = json.JSONDecoder()
    whitespace = re.compile(r'\s*')
    separators = re.compile(r'[\s,]*')

    pos = whitespace.match(text, 0).end()
    if pos >= len(text) or text[pos] != '[':
        raise json.JSONDecodeError("Expecting '[' at start of dump", text, pos)
    pos += 1

    items = []
    while True:
        # Skip whitespace and any number of (possibly redundant) commas.
        pos = separators.match(text, pos).end()
        if pos >= len(text):
            raise json.JSONDecodeError("Unterminated array in dump", text, pos)
        if text[pos] == ']':
            return items
        item, pos = decoder.raw_decode(text, pos)
        items.append(item)


def _load_cx_dump(path):
    """Read a gzipped Content Translation dump and return its list of sections."""
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with gzip.open(path, 'rt', encoding='utf-8') as fin:
        return _parse_lenient_json_array(fin.read())


parallel_corpus = _load_cx_dump('cx-corpora.en2es.text.json.gz')
---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
<ipython-input-10-2898b93ed3a6> in <module>()
      6 # remove repetitive commas
      7 json_str = re.sub(',{2,}', ',', json_str)
----> 8 parallel_corpus = json.loads(json_str)

/usr/lib/python3.6/json/__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    352             parse_int is None and parse_float is None and
    353             parse_constant is None and object_pairs_hook is None and not kw):
--> 354         return _default_decoder.decode(s)
    355     if cls is None:
    356         cls = JSONDecoder

/usr/lib/python3.6/json/decoder.py in decode(self, s, _w)
    337 
    338         """
--> 339         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    340         end = _w(s, end).end()
    341         if end != len(s):

/usr/lib/python3.6/json/decoder.py in raw_decode(self, s, idx)
    355             obj, end = self.scan_once(s, idx)
    356         except StopIteration as err:
--> 357             raise JSONDecodeError("Expecting value", s, err.value) from None
    358         return obj, end

JSONDecodeError: Expecting value: line 1 column 356418517 (char 356418516)
# Shape of one corpus entry, as used below:
#   'id' -> "<translationID>/<sectionID>"; <translationID> is the same ID
#           returned by the first API, and every translated section of an
#           article gets its own <sectionID>
#   'mt' -> whether machine translation was used in the interface; when set,
#           its 'engine' field names the service
print("Descriptive statistics:")
section_total = len(parallel_corpus)
print("{0} translated sections.".format(section_total))

# Distinct translation IDs == distinct translated articles.
article_ids = {section['id'].split('/')[0] for section in parallel_corpus}
print("{0} translated articles.".format(len(article_ids)))

# Tally sections per machine translation engine; sections without MT are
# counted under 'no-mt'.
mt_counts = {}
for section in parallel_corpus:
    engine = section['mt']['engine'] if section['mt'] else 'no-mt'
    mt_counts[engine] = mt_counts.get(engine, 0) + 1
print("Machine translation services used: {0}".format(mt_counts))

print("\nExample:\n", parallel_corpus[0])
Descriptive statistics:
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-dbd03e64eee0> in <module>()
      5 # mt indicates whether machine translation was used in the interface
      6 print("Descriptive statistics:")
----> 7 print("{0} translated sections.".format(len(parallel_corpus)))
      8 print("{0} translated articles.".format(len(set(sec['id'].split('/')[0] for sec in parallel_corpus))))
      9 mt_counts = {}

NameError: name 'parallel_corpus' is not defined