Initialization

# Imports
import gzip
import json
import re
import csv

import mwapi
import pandas as pd
# Session for the English Wikipedia API, which serves metadata about each translated article
API_HOST = 'https://en.wikipedia.org'
USER_AGENT = 'mwapi (python) --nisha outreachy applicant content translation'
session = mwapi.Session(host=API_HOST, user_agent=USER_AGENT)

# Query for published translations from English ('from') to Hindi ('to').
# One request asks for at most 500 records starting at offset 1500
# (presumably the offset skips earlier records; confirm against the API docs).
parameters = {
    'action': 'query',
    'format': 'json',
    'list': 'cxpublishedtranslations',
    'from': 'en',
    'to': 'hi',
    'limit': 500,
    'offset': 1500,
}

Get set of translated articles to dig more deeply into

# Send the query held in `parameters` through the API session
res = session.get(parameters)
# Count the translation records that came back (the cell output follows)
len(res['result']['translations'])
500
type(res['result']['translations'])
list
# Tabulate the translation records, save a UTF-8 CSV copy, and preview the first 10 rows
df = pd.DataFrame(res['result']['translations'])
df.to_csv('data.csv', encoding='utf-8')
df.head(10)
publishedDate sourceLanguage sourceRevisionId sourceTitle sourceURL stats targetLanguage targetRevisionId targetTitle targetURL translationId
0 20180515140736 en 841377001 Parashakti //en.wikipedia.org/wiki/Parashakti {'any': 0.70011025358324, 'human': 0.681367144... hi 3797623 Parashakti //hi.wikipedia.org/wiki/पराशक्ति 475284
1 20180515163106 en 828062313 Heavy Engineering Corporation //en.wikipedia.org/wiki/Heavy Engineering Corp... {'any': 0.84667228306655, 'human': 0.688008986... hi 3797741 हेवी इंजिनयरिंग कारपोरेशन //hi.wikipedia.org/wiki/हेवी इंजिनयरिंग कारपोरेशन 475296
2 20180516171722 en 841357718 Parashiva //en.wikipedia.org/wiki/Parashiva {'any': 0.62463556851312, 'human': 0.612244897... hi 3797669 परशिव //hi.wikipedia.org/wiki/परशिव 475301
3 20180517125014 en 841657596 Vishvesh Parmar //en.wikipedia.org/wiki/Vishvesh Parmar {'any': 0.9188589540412, 'human': 0.8973058637... hi 3799149 Vishvesh Parmar //hi.wikipedia.org/wiki/विश्वेश परमार 475992
4 20180520044631 en 842088000 Abraham Ortelius //en.wikipedia.org/wiki/Abraham Ortelius {'any': 0.95182367856446, 'human': 0.182562902... hi 3801543 अब्राहम ओरटेलियस //hi.wikipedia.org/wiki/अब्राहम ओरटेलियस 477273
5 20180520121439 en 839583375 Mohammad Usman //en.wikipedia.org/wiki/Mohammad Usman {'any': 0.93835616438356, 'human': 0.178200283... hi 3801910 मोहम्‍मद उस्मान\n //hi.wikipedia.org/wiki/मोहम्‍मद उस्मान 477468
6 20180523013111 en 840201235 Hajong people //en.wikipedia.org/wiki/Hajong people {'any': 0.25430430842738, 'human': 0.253480517... hi 3804422 हैजोंग लोग //hi.wikipedia.org/wiki/हैजोंग लोग 478712
7 20180523014543 en 827049944 Biete language //en.wikipedia.org/wiki/Biete language {'any': 0.29547471162378, 'human': 0.293700088... hi 3804426 बियाट भाषा //hi.wikipedia.org/wiki/बियाट भाषा 478717
8 20180525025656 en 842523127 Mahatma Gandhi University, Meghalaya //en.wikipedia.org/wiki/Mahatma Gandhi Univers... {'any': 1.0611045828437, 'human': 1.0352526439... hi 3806456 महात्मा गांधी विश्वविद्यालय, मेघालय //hi.wikipedia.org/wiki/महात्मा गांधी विश्वविद... 479721
9 20180525084819 en 841975210 Imarti Devi //en.wikipedia.org/wiki/Imarti Devi {'any': 1.5569007263923, 'human': 1.3765133171... hi 3806833 इमरती देवी //hi.wikipedia.org/wiki/इमरती devi 479812
# Show the row of the article used as the running example for the rest of the notebook
is_running_example = df['sourceTitle'].eq('Healthcare in India')
df.loc[is_running_example]
publishedDate sourceLanguage sourceRevisionId sourceTitle sourceURL stats targetLanguage targetRevisionId targetTitle targetURL translationId
309 20190103084650 en 876002613 Healthcare in India //en.wikipedia.org/wiki/Healthcare in India {'any': 0.16867469879518, 'human': 0.120481927... hi 4071123 भारत में स्वास्थ्य देखभाल //hi.wikipedia.org/wiki/भारत में स्वास्थ्य देखभाल 581940
# Find the example row by its source title rather than by the hard-coded row
# label 309: that label only matches this article for one particular API
# response, and would point at another article (or raise KeyError) once the
# fetched page of results changes.
health_matches = df.index[df['sourceTitle'] == 'Healthcare in India']
if len(health_matches) == 0:
    raise LookupError("'Healthcare in India' is not among the fetched translations")
health_row = health_matches[0]  # row label of the first match

# Metadata of the running example: titles, translation ID and source revision ID
healthIndia_source_title = df.loc[health_row, 'sourceTitle']
healthIndia_target_title = df.loc[health_row, 'targetTitle']
healthIndia_tid = df.loc[health_row, 'translationId']
healthIndia_source_revid = df.loc[health_row, 'sourceRevisionId']
print(healthIndia_source_title)
print(healthIndia_target_title)
print(healthIndia_tid)
print(healthIndia_source_revid)
Healthcare in India
भारत में स्वास्थ्य देखभाल
581940
876002613

Get corresponding parallel translation

Dump Files

# 1st option: Download dump to access all translated articles
# Decode explicitly as UTF-8: the dump contains Hindi text, and text mode would
# otherwise fall back to the platform's default encoding.
with gzip.open('cx-corpora.en2hi.text.json.gz', 'rt', encoding='utf-8') as fin:
    # Strip each line and join once, instead of growing a string with += per line
    json_str = "".join(line.strip() for line in fin)
# Collapse runs of two or more commas into one before parsing
# NOTE(review): this also rewrites comma runs inside section text, if any occur - confirm acceptable
json_str = re.sub(r',{2,}', ',', json_str)
parallel_corpus = json.loads(json_str)
# Shape of one corpus entry (see the example printed at the end):
#   id - a string of the form <translationID>/<sectionID>
#        <translationID> matches the IDs in the data accessed from the first API
#        each translated section of an article gets its own <sectionID>
#   mt - indicates whether machine translation was used in the interface
print("Descriptive statistics:")
print("{0} translated sections.".format(len(parallel_corpus)))
# Distinct translation IDs: the part of each id before the first '/'
article_ids = {sec['id'].split('/')[0] for sec in parallel_corpus}
print("{0} translated articles.".format(len(article_ids)))
# Tally sections per machine-translation engine; sections with a falsy 'mt' go under 'no-mt'
mt_counts = {}
for sec in parallel_corpus:
    service = sec['mt']['engine'] if sec['mt'] else 'no-mt'
    mt_counts[service] = mt_counts.get(service, 0) + 1
print("Machine translation services used: {0}".format(mt_counts))
print("\nExample:\n", parallel_corpus[0])
Descriptive statistics:
12577 translated sections.
1725 translated articles.
Machine translation services used: {'no-mt': 4776, 'Yandex': 6735, 'scratch': 27, 'Google': 1039}

Example:
 {'id': '41161/mwAQ', 'sourceLanguage': 'en', 'targetLanguage': 'hi', 'source': {'content': 'Rajasthan College is a college in Jaipur city in Rajasthan state in India. It is one of five constituent colleges of University of Rajasthan. The college offers undergraduate courses in Arts. It is situated on Jawahar Lal Nehru Road. The college is also known as University Rajasthan College. Vivekanand Hostel a University hostel is associated hostel of the college. It is located at near by the commerce college.'}, 'mt': None, 'target': {'content': 'विश्वविद्यालय राजस्थान महाविद्यालय भारतीय राज्य राजस्थान के जयपुर नगर में स्थित एक महाविद्यालय है। यह राजस्थान विश्वविद्यालय के छः घटक महाविद्यालयों में से एक है। महाविद्यालय में स्नातक स्तर के कला संकाय के पाठ्यक्रमों का अध्ययन करवाया जाता है। यह जवाहरलाल नेहरू मार्ग पर स्थित है।\xa0इस महाविद्यालय से\xa0सम्बद्ध\xa0छात्रावास विवेकानन्द छात्रावास है।'}}
# Print every section of translation ID 132474 found in the dump.
# (Its title section in the output reads "Inquilab Zindabad".)
for translated_section in parallel_corpus:
    # The translation ID is the part of 'id' before the first '/'
    translation_id = translated_section['id'].partition('/')[0]
    if translation_id != '132474':
        continue
    print(translated_section, '\n')
{'id': '132474/mwAQ', 'sourceLanguage': 'en', 'targetLanguage': 'hi', 'source': {'content': 'Inquilab Zindabad (Hindustani: इंक़िलाब ज़िन्दाबाद (Devanagari), اِنقلاب زِنده باد (Nasta\'liq), Punjabi: ਇਨਕਲਾਬ ਜ਼ਿੰਦਾਬਾਦ) is a phrase which translates to "Long Live the Revolution!" Its first use by a revolutionary was in 1929 when Bhagat Singh shouted it after bombing the Central Assembly in Delhi. It became one of the rallying cries of the Indian independence movement. The famous slogan was given by Hasrat Mohani and inspired the activities of the Hindustan Socialist Republican Association particularly Ashfaqulla Khan, Bhagat Singh and Chandrashekhar Azad.The famous slogan of Indian freedom fighters Inquilab Zindabad was coined by Moulana Hasrat Mohani himself.[1] In Indian political novels chronicling the independence movement, a pro-independence sentiment is often characterized by characters shouting this slogan.[2]'}, 'mt': None, 'target': {'content': "इंक़िलाब ज़िन्दाबाद (Hindustani: इंक़िलाब ज़िन्दाबाद (Devanagari), اِنقلاب زِنده باد (Nasta'liq), Punjabi: ਇਨਕਲਾਬ ਜ਼ਿੰਦਾਬਾਦ) एक नारा है जिसे भगत सिंह और उनके क्रांतिकारी साथियों ने दिल्ली की असेंबली में 8 अप्रेल 1929 को एक आवाज़ी बम फोड़ते वक्त बुलंद किया था। यह नारा मशहूर शायर हसरत मोहानी ने एक जलसे में, आज़ादी-ए-कामिल (पूर्ण आज़ादी) की बात करते हुए दिया था।\xa0[1] और इसने हिंदुस्तान सोशलिस्ट रिपब्लिकन एसोसिएशन की गतिविधियों को और विशेष रूप से अशफ़ाक़ुल्लाह ख़ाँ, भगत सिंह और चंद्रशेखर आजाद को प्रेरित किया। स्वतंत्रता आंदोलन के तारीखवार भारतीय राजनीतिक उपन्यासों में, स्वतंत्रता समर्थक भावना अक्सर इस नारे को लगाने वाले पात्रों की विशेषता है।[2]"}} 

{'id': '132474/mwDw', 'sourceLanguage': 'en', 'targetLanguage': 'hi', 'source': {'content': 'References'}, 'mt': None, 'target': {'content': 'संदर्भ'}} 

{'id': '132474/mwEA', 'sourceLanguage': 'en', 'targetLanguage': 'hi', 'source': {'content': '↑ Bhattacherje, S. B. (2009). Encyclopaedia of Indian Events & Dates. Sterling Publishers. p.\xa0A172. ISBN\xa09788120740747.\xa0 ↑ Bhatnagar, O.P. (2007). Indian Political Novel in English. Delhi: Saruk and Sons. p.\xa042. ISBN\xa09788176257992.\xa0'}, 'mt': None, 'target': {'content': '↑ Bhattacherje, S. B. (2009). Encyclopaedia of Indian Events & Dates. Sterling Publishers. p.\xa0A172. ISBN\xa09788120740747.\xa0 ↑ Bhatnagar, O.P. (2007). Indian Political Novel in English. Delhi: Saruk and Sons. p.\xa042. ISBN\xa09788176257992.\xa0'}} 

{'id': '132474/mwcx-source-title', 'sourceLanguage': 'en', 'targetLanguage': 'hi', 'source': {'content': 'Inquilab Zindabad'}, 'mt': None, 'target': {'content': 'इंक़िलाब ज़िन्दाबाद'}} 

# 2nd option: retrieve the sections of a single translation through the API
# instead of the dump. The request names translation ID 132474, the same ID
# that is filtered out of the dump in the loop over parallel_corpus.
parallel_corpus_parameters = dict(
    action='query',
    format='json',
    list='contenttranslationcorpora',
    translationid='132474',
    striphtml=True,
)
# Fetch the sections and display the raw response
parallel_corpus_res = session.get(parallel_corpus_parameters)
parallel_corpus_res
{'batchcomplete': '',
 'query': {'contenttranslationcorpora': {'sections': {'mwAQ': {'sequenceid': 1,
     'source': {'engine': None,
      'content': 'Inquilab Zindabad (Hindustani: इंक़िलाब ज़िन्दाबाद (Devanagari), اِنقلاب زِنده باد (Nasta\'liq), Punjabi: ਇਨਕਲਾਬ ਜ਼ਿੰਦਾਬਾਦ) is a phrase which translates to "Long Live the Revolution!" Its first use by a revolutionary was in 1929 when Bhagat Singh shouted it after bombing the Central Assembly in Delhi. It became one of the rallying cries of the Indian independence movement. The famous slogan was given by Hasrat Mohani and inspired the activities of the Hindustan Socialist Republican Association particularly Ashfaqulla Khan, Bhagat Singh and Chandrashekhar Azad.The famous slogan of Indian freedom fighters Inquilab Zindabad was coined by Moulana Hasrat Mohani himself.[1] In Indian political novels chronicling the independence movement, a pro-independence sentiment is often characterized by characters shouting this slogan.[2]',
      'timestamp': '2016-03-24T04:53:51Z'},
     'mt': None,
     'user': {'engine': None,
      'content': "इंक़िलाब ज़िन्दाबाद (Hindustani: इंक़िलाब ज़िन्दाबाद (Devanagari), اِنقلاب زِنده باد (Nasta'liq), Punjabi: ਇਨਕਲਾਬ ਜ਼ਿੰਦਾਬਾਦ) एक नारा है जिसे भगत सिंह और उनके क्रांतिकारी साथियों ने दिल्ली की असेंबली में 8 अप्रेल 1929 को एक आवाज़ी बम फोड़ते वक्त बुलंद किया था। यह नारा मशहूर शायर हसरत मोहानी ने एक जलसे में, आज़ादी-ए-कामिल (पूर्ण आज़ादी) की बात करते हुए दिया था।\xa0[1] और इसने हिंदुस्तान सोशलिस्ट रिपब्लिकन एसोसिएशन की गतिविधियों को और विशेष रूप से अशफ़ाक़ुल्लाह ख़ाँ, भगत सिंह और चंद्रशेखर आजाद को प्रेरित किया। स्वतंत्रता आंदोलन के तारीखवार भारतीय राजनीतिक उपन्यासों में, स्वतंत्रता समर्थक भावना अक्सर इस नारे को लगाने वाले पात्रों की विशेषता है।[2]",
      'timestamp': '2016-03-24T06:06:28Z'}},
    'mwDw': {'sequenceid': 17,
     'source': {'engine': None,
      'content': 'References',
      'timestamp': '2016-03-24T04:53:51Z'},
     'mt': None,
     'user': {'engine': None,
      'content': 'संदर्भ',
      'timestamp': '2016-03-24T06:06:38Z'}},
    'mwEA': {'sequenceid': 19,
     'source': {'engine': None,
      'content': '↑ Bhattacherje, S. B. (2009). Encyclopaedia of Indian Events & Dates. Sterling Publishers. p.\xa0A172. ISBN\xa09788120740747.\xa0 ↑ Bhatnagar, O.P. (2007). Indian Political Novel in English. Delhi: Saruk and Sons. p.\xa042. ISBN\xa09788176257992.\xa0',
      'timestamp': '2016-03-24T04:53:51Z'},
     'mt': None,
     'user': {'engine': None,
      'content': '↑ Bhattacherje, S. B. (2009). Encyclopaedia of Indian Events & Dates. Sterling Publishers. p.\xa0A172. ISBN\xa09788120740747.\xa0 ↑ Bhatnagar, O.P. (2007). Indian Political Novel in English. Delhi: Saruk and Sons. p.\xa042. ISBN\xa09788176257992.\xa0',
      'timestamp': '2016-03-24T04:53:51Z'}},
    'mwcx-source-title': {'sequenceid': 0,
     'source': {'engine': None,
      'content': 'Inquilab Zindabad',
      'timestamp': '2016-03-24T04:54:01Z'},
     'mt': None,
     'user': {'engine': None,
      'content': 'इंक़िलाब ज़िन्दाबाद',
      'timestamp': '2016-03-24T04:54:05Z'}}}}}}

Analyses

Qualitative

Quantitative Analyses

# Point the session at Hindi Wikipedia. This rebinds `session`, so requests
# made after this line go to hi.wikipedia.org instead of en.wikipedia.org.
session = mwapi.Session(host='https://hi.wikipedia.org',
                        user_agent='mwapi (python) --nisha outreachy applicant content translation')

# Page revision history example
revision_parameters = {
    "action": "query",
    "prop": "revisions",
    # Page whose history is requested: the Hindi title of the running example.
    # This entry was commented out, which left the request without any page.
    "titles": healthIndia_target_title,
    # Fields returned for each revision
    "rvprop": "timestamp|user|comment",
    "rvlimit": 100,
    # A stray "" literal used to precede this key; implicit string
    # concatenation made it harmless, but it has been removed.
    "rvslots": "main",
    "formatversion": "2",
    "format": "json",
    "rvdir": "newer",
    "redirects": True,
}
# Fetch the revision list and display the raw response
healthIndia_revisions = session.get(revision_parameters)
healthIndia_revisions
{'batchcomplete': True,
 'query': {'pages': [{'pageid': 997799,
    'ns': 0,
    'title': 'भारत में स्वास्थ्य देखभाल',
    'revisions': [{'user': 'Nilesh shukla',
      'timestamp': '2019-01-03T08:46:50Z',
      'comment': '"[[:en:Special:Redirect/revision/876002613|Healthcare in India]]" पृष्ठ का अनुवाद करके निर्मित किया गया'},
     {'user': 'Nilesh shukla',
      'timestamp': '2019-01-03T08:50:39Z',
      'comment': 'आंशिक सुधार किया।'},
     {'user': 'Nilesh shukla',
      'timestamp': '2019-01-03T09:26:28Z',
      'comment': 'विस्तार किया।'},
     {'user': 'Nilesh shukla',
      'timestamp': '2019-01-03T09:28:13Z',
      'comment': 'आंशिक सुधार किया।'}]}]}}

The article chosen above does not have much revision history on Hindi Wikipedia.

# Preview the last 50 rows of the fetched translations
df.iloc[-50:]

Qualitative Analysis

  • What types of article sections are converted as-is? (i.e., humans do not modify the machine-translated versions)

113 - Sports Equipment; 211 - Indira Ranamagar

  • What sections are changed substantially?
  • Most viewed articles
# Install the mwviews package from its GitHub repository (IPython shell escape, not plain Python)
!pip install git+https://github.com/mediawiki-utilities/python-mwviews.git
import mwviews
from mwviews.api import PageviewsClient
# Create a pageviews client that identifies itself with this user agent
p = PageviewsClient(user_agent='mwviews (python) -- outreachy content translation')
# Request the top articles for hi.wikipedia, with limit=50
p.top_articles('hi.wikipedia', limit=50)
 

Quantitative Analysis