# Imports
import gzip
import json
import re

import mwapi
import pandas as pd
# Session against the English Wikipedia API, which serves the metadata
# about each translated article.
session = mwapi.Session(
    host='https://en.wikipedia.org',
    user_agent='mwapi (python) -- outreachy content translation',
)

# Query for published translations going from English ('from') to Arabic
# ('to'). 'limit' and 'offset' pick which slice of the list is requested.
parameters = {
    'action': 'query',
    'format': 'json',
    'list': 'cxpublishedtranslations',
    'from': 'en',
    'to': 'ar',
    'limit': 500,
    'offset': 10000,
}
res = session.get(parameters)

# Peek at the first ten raw records (shown as cell output in a notebook).
res['result']['translations'][:10]

# One DataFrame row per translation record returned by the API.
df = pd.DataFrame(data=res['result']['translations'])
df.head(10)

From the data above, I can make a few observations:

  • The translated titles of some articles do not convey the true meaning of the articles, for example: khash Country, Altab Ali.

However, the Arabic version of the article on Wikipedia has the correct title. I think this is because a literal translation is not useful here; in Arabic, the title should be translated by its meaning.

  • In all of the articles, human translation was used more than machine translation. In some articles where only machine translation was used, the Arabic version contains just the general description of the article, not all of it.

  • More than 6 articles have poor translation statistics, which tells us that not much of these articles has been translated into Arabic.

# Show every row whose source title is the Socotra takeover article.
df.loc[df['sourceTitle'].eq('United Arab Emirates takeover of Socotra')]

There are two publication dates here for the same article, I think because the original article was modified, and there is more human translation in the modified version.

# Show every row whose source title is the Boeing Model 203 article.
df.loc[df['sourceTitle'].eq('Boeing Model 203')]

The amount of human translation is very low for some articles, such as the Boeing Model 203 article. The Arabic translation of this article covers the general description only.

# Page revision history for Boeing Model 203.
# The article's metadata is read from row 5 of the translations DataFrame.
gboost_source_title = df.at[5, 'sourceTitle']
gboost_target_title = df.at[5, 'targetTitle']
gboost_tid = df.at[5, 'translationId']
gboost_source_revid = df.at[5, 'sourceRevisionId']

# Ask for up to 100 revisions of the source article, beginning at the
# revision the translation was based on and moving forward in time
# (rvdir="newer"); only timestamp, user and comment are requested.
revision_parameters = dict(
    action="query",
    prop="revisions",
    titles=gboost_source_title,
    rvprop="timestamp|user|comment",
    rvlimit=100,
    rvstartid=gboost_source_revid,
    rvslots="main",
    formatversion="2",
    format="json",
    rvdir="newer",
)
gboost_revisions = session.get(revision_parameters)
gboost_revisions
# Show every row whose source title is the Altab Ali article.
df.loc[df['sourceTitle'].eq('Altab Ali')]

The same is true of the article named Altab Ali (this is not the real title of the article): the article is about British Bangladeshis, Altab Ali is one of the people it covers, and the amount of human translation is very low. The Arabic translation of this article covers the general description only.

# Pull the metadata of the translation stored in row 7 of the DataFrame.
gboost_source_title = df.at[7, 'sourceTitle']
gboost_target_title = df.at[7, 'targetTitle']
gboost_tid = df.at[7, 'translationId']
gboost_source_revid = df.at[7, 'sourceRevisionId']

# Echo the four values, one per line.
print(
    gboost_source_title,
    gboost_target_title,
    gboost_tid,
    gboost_source_revid,
    sep='\n',
)
# Load the English -> Arabic parallel corpus from the gzipped JSON dump.
#
# The file is decoded explicitly as UTF-8: it holds Arabic text, and without
# an explicit encoding gzip.open() in text mode falls back to the platform
# default, which can fail or garble the text on non-UTF-8 locales.
with gzip.open('cx-corpora.en2ar.text.json.gz', 'rt', encoding='utf-8') as fin:
    # Strip each line and glue everything together in a single pass
    # (avoids repeated string concatenation inside the loop).
    json_str = "".join(line.strip() for line in fin)
# remove repetitive commas so the text can be parsed as JSON
json_str = re.sub(r',{2,}', ',', json_str)
parallel_corpus = json.loads(json_str)
# Shape of a corpus entry, as used below:
#   id - a string of the form <translationID>/<sectionID>;
#        <translationID> matches the data accessed from the first API, and
#        every translated section of an article gets its own <sectionID>
#   mt - indicates whether machine translation was used in the interface
print("Descriptive statistics:")
print("{0} translated sections.".format(len(parallel_corpus)))
# Distinct translation ids == number of translated articles.
print("{0} translated articles.".format(
    len({sec['id'].split('/')[0] for sec in parallel_corpus})))

# Tally sections per machine-translation engine; sections whose 'mt' value
# is falsy are counted under 'no-mt'.
mt_counts = {}
for sec in parallel_corpus:
    service = sec['mt']['engine'] if sec['mt'] else 'no-mt'
    mt_counts[service] = mt_counts.get(service, 0) + 1
print("Machine translation services used: {0}".format(mt_counts))
print("\nExample:\n", parallel_corpus[0])
# Translated sections for the "Altab Ali" article: print every corpus entry
# whose translation id (the part of 'id' before the first '/') matches.
for translated_section in parallel_corpus:
    if translated_section['id'].partition('/')[0] != gboost_tid:
        continue
    print(translated_section, '\n')

The Arabic-language sequence is weak in some sections here, although when I went back to the original article on Wikipedia the sequence was correct there. I don't know why the sequence is weak in the parallel translation obtained by the code here.

# Page revision history for Altab Ali.
# Ask for up to 100 revisions of the source article, beginning at the
# revision the translation was based on and moving forward in time
# (rvdir="newer"); only timestamp, user and comment are requested.
revision_parameters = dict(
    action="query",
    prop="revisions",
    titles=gboost_source_title,
    rvprop="timestamp|user|comment",
    rvlimit=100,
    rvstartid=gboost_source_revid,
    rvslots="main",
    formatversion="2",
    format="json",
    rvdir="newer",
)
gboost_revisions = session.get(revision_parameters)
gboost_revisions

Seven users modified the Altab Ali article between 1/5/2018 and 25/8/2018, changing the death date information, the Legacy section, and further content and sources; all of the modifications were made to the English-language version of the article.

# Pull the metadata of the translation stored in row 8 of the DataFrame.
gboost_source_title = df.at[8, 'sourceTitle']
gboost_target_title = df.at[8, 'targetTitle']
gboost_tid = df.at[8, 'translationId']
gboost_source_revid = df.at[8, 'sourceRevisionId']

# Echo the four values, one per line.
print(
    gboost_source_title,
    gboost_target_title,
    gboost_tid,
    gboost_source_revid,
    sep='\n',
)

# Translated sections for the "United Arab Emirates takeover of Socotra"
# article: print every corpus entry whose translation id (the part of 'id'
# before the first '/') matches.
for translated_section in parallel_corpus:
    if translated_section['id'].partition('/')[0] != gboost_tid:
        continue
    print(translated_section, '\n')

For United Arab Emirates takeover of Socotra there is almost no parallel translation into Arabic, although the original article on Wikipedia is fully translated into Arabic.

# Page revision history for United Arab Emirates takeover of Socotra.
# Ask for up to 100 revisions of the source article, beginning at the
# revision the translation was based on and moving forward in time
# (rvdir="newer"); only timestamp, user and comment are requested.
revision_parameters = dict(
    action="query",
    prop="revisions",
    titles=gboost_source_title,
    rvprop="timestamp|user|comment",
    rvlimit=100,
    rvstartid=gboost_source_revid,
    rvslots="main",
    formatversion="2",
    format="json",
    rvdir="newer",
)
gboost_revisions = session.get(revision_parameters)
gboost_revisions

There are many modifications here to the original article, from 6 May 2018 to 19 Feb 2019, on many things like