# Imports
import os
import gzip
import json
import re
import requests
from bs4 import BeautifulSoup as BS

import mwapi
import pandas as pd
# Session for the MediaWiki API that exposes metadata about each translated article
session = mwapi.Session(host='https://en.wikipedia.org',
                        user_agent='mwapi (python) -- outreachy content translation')

# Articles translated from English -> Hindi ('hi'), the same language pair as
# the en2hi corpus dump loaded later in this notebook.
# 'limit' is the page size and 'offset' is where the page starts. Start at 0:
# an offset past the end of the list (20000 was used here before) makes the
# API return an empty 'translations' list, and every later lookup on the
# resulting table then fails with a KeyError.
parameters = {'action':'query',
              'format':'json',
              'list':'cxpublishedtranslations',
              'from':'en',
              'to':'hi',
              'limit':100,
              'offset':0}
res = session.get(parameters)
# Peek at the first few records returned by the API
res['result']['translations'][:10]
[]
# Tabulate the translation records returned by the API (one row per record).
# NOTE: when the API returns an empty list the DataFrame has no columns at all.
df = pd.DataFrame(res['result']['translations'])
df.head(10)
# I'll use this example for the rest of the notebook
# Guarded lookup: with an empty result there is no 'sourceTitle' column, so show
# the (empty) table instead of raising KeyError. Kept as a single expression so
# the notebook still displays the outcome.
df[df['sourceTitle'] == 'Gradient boosting'] if 'sourceTitle' in df.columns else df
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/srv/paws/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3063             try:
-> 3064                 return self._engine.get_loc(key)
   3065             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'sourceTitle'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-19-5cc73fb91e9b> in <module>()
      1 # I'll use this example for the rest of the notebook
----> 2 df[df['sourceTitle'] == 'Gradient boosting']

/srv/paws/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

/srv/paws/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

/srv/paws/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2484         res = cache.get(item)
   2485         if res is None:
-> 2486             values = self._data.get(item)
   2487             res = self._box_item_values(item, values)
   2488             cache[item] = res

/srv/paws/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

/srv/paws/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3064                 return self._engine.get_loc(key)
   3065             except KeyError:
-> 3066                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3067 
   3068         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'sourceTitle'
# Pull the fields of the example translation (the row labelled 8) that the
# rest of the notebook works with, then echo them one per line.
(gboost_source_title,
 gboost_target_title,
 gboost_tid,
 gboost_source_revid) = (
    df.loc[8, column]
    for column in ('sourceTitle', 'targetTitle', 'translationId', 'sourceRevisionId')
)
print(gboost_source_title,
      gboost_target_title,
      gboost_tid,
      gboost_source_revid,
      sep='\n')
BFR (rocket)
BFR (cohete)
374751
802912035
# 1st option: Download dump to access all translated articles
# Read the gzipped dump as text and glue the whitespace-stripped lines into one
# JSON document. The encoding is given explicitly: the dump contains non-ASCII
# (Hindi) text, and the default for text mode depends on the machine's locale.
with gzip.open('cx-corpora.en2hi.text.json.gz', 'rt', encoding='utf-8') as fin:
    json_str = "".join(line.strip() for line in fin)
# remove repetitive commas
# NOTE(review): this substitution runs over the whole document, so a run of
# commas inside a text value would be collapsed as well -- confirm acceptable.
json_str = re.sub(r',{2,}', ',', json_str)
parallel_corpus = json.loads(json_str)
# Example:
# Every entry's 'id' is a string of the form <translationID>/<sectionID>.
# <translationID> is the id found in the data accessed from the first API;
# each translated section of an article gets its own <sectionID>.
# 'mt' indicates whether machine translation was used in the interface.
print("Descriptive statistics:")
section_total = len(parallel_corpus)
print("{0} translated sections.".format(section_total))
# The part of the id before the first '/' identifies the article
article_ids = {entry['id'].partition('/')[0] for entry in parallel_corpus}
print("{0} translated articles.".format(len(article_ids)))

# Tally sections per machine-translation engine; entries with no 'mt' value
# are counted under 'no-mt'.
mt_counts = {}
for entry in parallel_corpus:
    mt_info = entry['mt']
    label = mt_info['engine'] if mt_info else 'no-mt'
    mt_counts[label] = mt_counts.get(label, 0) + 1

print("Machine translation services used: {0}".format(mt_counts))
print("\nExample:\n", parallel_corpus[0])
Descriptive statistics:
12577 translated sections.
1725 translated articles.
Machine translation services used: {'no-mt': 4776, 'Yandex': 6735, 'scratch': 27, 'Google': 1039}

Example:
 {'id': '41161/mwAQ', 'sourceLanguage': 'en', 'targetLanguage': 'hi', 'source': {'content': 'Rajasthan College is a college in Jaipur city in Rajasthan state in India. It is one of five constituent colleges of University of Rajasthan. The college offers undergraduate courses in Arts. It is situated on Jawahar Lal Nehru Road. The college is also known as University Rajasthan College. Vivekanand Hostel a University hostel is associated hostel of the college. It is located at near by the commerce college.'}, 'mt': None, 'target': {'content': 'विश्वविद्यालय राजस्थान महाविद्यालय भारतीय राज्य राजस्थान के जयपुर नगर में स्थित एक महाविद्यालय है। यह राजस्थान विश्वविद्यालय के छः घटक महाविद्यालयों में से एक है। महाविद्यालय में स्नातक स्तर के कला संकाय के पाठ्यक्रमों का अध्ययन करवाया जाता है। यह जवाहरलाल नेहरू मार्ग पर स्थित है।\xa0इस महाविद्यालय से\xa0सम्बद्ध\xa0छात्रावास विवेकानन्द छात्रावास है।'}}