Objective

This notebook serves two purposes:

  • To build some basic functions that may help in further analysis
  • To explore how the percentage of human edits varies across different machine translation engines
import gzip
import json
import re
import pandas as pd
import mwapi
import random
from collections import Counter
# This function gets all metadata provided by the cxpublishedtranslations API. Its output is a list.
# I will refer to the output of this function as "translationsMetaData" in the following functions.
# The function prints its progress every 10,000 records.
def getAllMetaDataTranslations(fromLanguage ="en", toLanguage="ar"):
    session = mwapi.Session(host='https://en.wikipedia.org',
                        user_agent='mwapi (python) -- outreachy content translation')
    parameters = {'action':'query',
              'format':'json',
              'list':'cxpublishedtranslations',
              'from':fromLanguage,
              'to':toLanguage,
              'limit':500,
              'offset':0}
    res = session.get(parameters)
    offset = 500
    parameters["offset"] = offset
    resNext = session.get(parameters)
    
    while len(resNext['result']["translations"]) > 0:
        res['result']['translations'] += resNext['result']["translations"]
        offset += 500
        parameters["offset"] = offset
        resNext = session.get(parameters)
        # Print progress every 10,000 records
        if offset % 10000 == 0:
            print("Offset = {0}".format(offset) )
    return res["result"]["translations"]
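A minimal usage sketch, assuming the record fields that getRandomArticle below relies on ("translationId", "sourceURL", "targetURL", "stats"); the printed values depend on the live API:
# Sketch only: fetch en->ar metadata and inspect the first record
metaData = getAllMetaDataTranslations(fromLanguage="en", toLanguage="ar")
sample = metaData[0]
print(sample["translationId"], sample["sourceURL"], sample["targetURL"])
print(sample["stats"])   # the "any", "mt", and "human" percentages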
# This function loads the list of section translations from a dump file. fileName is a string holding the name of the dump file.
# I will refer to the output of this function as "translationsCorpora" in the following functions.
# The output of the function is a list.
def getTranslationsCorpora(fileName:str):
    json_str = ""
    with gzip.open(fileName, 'rt') as fin:
        json_str = fin.read()
    # remove repetitive commas
    json_str = re.sub(',{2,}', ',', json_str)
    translationsCorpora = json.loads(json_str)
    return translationsCorpora
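To illustrate the comma cleanup, a made-up malformed snippet (the ids are hypothetical, not taken from a real dump):
snippet = '[{"id": "123/1"},,,{"id": "123/2"}]'   # hypothetical malformed JSON
cleaned = re.sub(',{2,}', ',', snippet)
print(cleaned)        # [{"id": "123/1"},{"id": "123/2"}]
json.loads(cleaned)   # now parses without error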
# This function gets the metadata of a translationId, given the list translationsMetaData
def getMetaDataOfTranslationId(translationId:str, translationsMetaData:list):
    s = [x for x in translationsMetaData if x["translationId"] == translationId]
    # Return None instead of raising IndexError when the translationId is not found
    return s[0] if s else None

# This function gets a list of all sections translated from translationsCorpora for a given translationId
def getSectionsTranslatedOfTranslationId(translationId:str, translationsCorpora:list):
    s = [x for x in translationsCorpora if x["id"].split("/")[0] == translationId]
    return s
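The two lookup functions above scan their whole list on every call. When many lookups are needed, building a dictionary index once is faster; a minimal sketch (buildSectionsIndex is my own name, not used elsewhere in this notebook):
def buildSectionsIndex(translationsCorpora:list):
    # One pass: map translationId -> list of its translated sections
    # (section ids have the form "<translationId>/<sectionId>")
    index = {}
    for section in translationsCorpora:
        translationId = section["id"].split("/")[0]
        index.setdefault(translationId, []).append(section)
    return index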
    
# This function prints the metadata and translated sections for a random translationId
def getRandomArticle(translationsMetaData:list, translationsCorpora:list):
    r = random.randrange(0, len(translationsCorpora))
    r_translationId = translationsCorpora[r]["id"].split('/')[0]
    r_metaData = getMetaDataOfTranslationId(r_translationId, translationsMetaData)
    print("Translation ID: " + r_translationId)
    print("Source URL: " + r_metaData["sourceURL"])
    print("Target URL: " + r_metaData["targetURL"])
    print("Machine: {0},  Human: {1},  Any: {2}".format(
        r_metaData["stats"]["mt"], r_metaData["stats"]["human"], r_metaData["stats"]["any"]) + "\n" )
    for index, sec in enumerate( getSectionsTranslatedOfTranslationId(r_translationId, translationsCorpora) ) :
        print("Section number {}".format(index+1)  )
        print("Source: " + sec["source"]["content"])
        if sec["mt"]:
            print("Machine {0}: ".format(sec["mt"]["engine"]) + sec["mt"]["content"])
        print("Target: " + sec["target"]["content"] + "\n")

Analysis of Machine Translation Engines

# This function gets the most-used machine translation engine for each article (translationId)
# Note that treating the most-used engine of an article as its only engine affects accuracy,
# but it serves as an approximation in the current state
# The output of this function is a dictionary: {"<translationId>": "<mtEngine>"}
def getMtEnginesFromCorpora(translationsCorpora:list):
    mtEngines = {}
    for section in translationsCorpora:
        translationId = section["id"].split("/")[0]
        if section["mt"]:
            if translationId not in mtEngines:
                mtEngines[translationId] = []
            mtEngines[translationId].append(section["mt"]["engine"])
    # Keep only the most commonly used machine translation engine per article
    for translationId in mtEngines:
        enginesCounter = Counter(mtEngines[translationId])
        mtEngines[translationId] = enginesCounter.most_common(1)[0][0]
    return mtEngines
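A made-up three-section corpus illustrating the "most used engine" approximation (the id and contents are hypothetical):
toyCorpora = [
    {"id": "42/1", "mt": {"engine": "Google", "content": "..."}},
    {"id": "42/2", "mt": {"engine": "Yandex", "content": "..."}},
    {"id": "42/3", "mt": {"engine": "Google", "content": "..."}},
]
print(getMtEnginesFromCorpora(toyCorpora))   # {'42': 'Google'}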

# This function gets the stats dictionary ( {"any", "mt", "human"} ) for all articles
# The output of this function is a dictionary: {"<translationId>": {"any": <#>, "mt": <#>, "human": <#> } }
def getTranslationStatsFromMetaData(translationsMetaData:list):
    translationStats = {}
    for article in translationsMetaData:
        translationId = article["translationId"]
        translationStats[translationId] = article["stats"]
    return translationStats
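For reference, a tiny made-up metadata list and the dictionary it produces (the numbers are invented for illustration):
toyMetaData = [{"translationId": "42", "stats": {"any": 0.9, "mt": 0.3, "human": 0.6}}]
print(getTranslationStatsFromMetaData(toyMetaData))
# {'42': {'any': 0.9, 'mt': 0.3, 'human': 0.6}}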
# This function gets the statistics of machine translation engines used across two languages
# Inputs: translationsCorpora: a list containing all sections translated between the two languages
#         translationsMetaData: a list containing the percentages of mt, human, and any translation for each article
# The output of this function is a pandas DataFrame containing the machine translation engines used in translation,
# the number of articles that mainly used each engine for translation,
# and the average percentages of machine and human contribution to the translations

def getMtEnginesStatistics(translationsCorpora:list, translationsMetaData:list):
    mtEngines = getMtEnginesFromCorpora(translationsCorpora)
    translationStats = getTranslationStatsFromMetaData(translationsMetaData)
    mtEnginesStats = {}
    for translationId in mtEngines.keys():
        mtEngine = mtEngines[translationId]
        # Use .get to guard against translationIds present in the dumps but missing from the API metadata
        tStats = translationStats.get(translationId)
        if mtEngine and tStats:
            if mtEngine not in mtEnginesStats:
                mtEnginesStats[mtEngine] = {}
            mtEnginesStats[mtEngine]["numArticles"] = mtEnginesStats[mtEngine].get("numArticles",0) + 1
            if tStats["any"]:
                # Add the ratio of (machine translation percentage / any translation percentage)
                mtEnginesStats[mtEngine]["machineUsedPercent"] = mtEnginesStats[mtEngine].get(
                "machineUsedPercent",0) + tStats["mt"]/tStats["any"]
                
                # Add the ratio of ( human translation percentage / any translation percentage)
                mtEnginesStats[mtEngine]["humanUsedPercent"] = mtEnginesStats[mtEngine].get(
                "humanUsedPercent",0) + tStats["human"]/tStats["any"]
    # Compute the average of "machineUsedPercent" and "humanUsedPercent" per engine
    for mtEngine in mtEnginesStats:
        numArticles = mtEnginesStats[mtEngine]["numArticles"]
        # .get with a default of 0 covers engines whose articles all had tStats["any"] == 0
        mtEnginesStats[mtEngine]["machineUsedPercent"] = mtEnginesStats[mtEngine].get("machineUsedPercent", 0) / numArticles
        mtEnginesStats[mtEngine]["humanUsedPercent"] = mtEnginesStats[mtEngine].get("humanUsedPercent", 0) / numArticles
    # Convert mtEnginesStats to a pandas DataFrame to view the results in a table
    mtEnginesStatsList = []
    for mtEngine in mtEnginesStats.keys():
        # Include a machine translation engine only if it was used for at least 10 articles
        if mtEnginesStats[mtEngine]["numArticles"] >= 10:
            mtEnginesStatsList.append([mtEngine, mtEnginesStats[mtEngine]["numArticles"], 
                               mtEnginesStats[mtEngine]["machineUsedPercent"], mtEnginesStats[mtEngine]["humanUsedPercent"]])
    mtEnginesStatsDF = pd.DataFrame(mtEnginesStatsList, 
                                   columns= ["Engine", "Number of Articles", 
                                             "Average Contribution of Engine's Translations", 
                                             "Average Contribution of Human's Translations"])
    mtEnginesStatsDF.sort_values(by=["Average Contribution of Engine's Translations"], ascending=False, inplace=True)
    return mtEnginesStatsDF

English to Arabic

enToArTranslationsMetaData = getAllMetaDataTranslations(fromLanguage="en", toLanguage="ar")
enToArTranslationsCorpora = getTranslationsCorpora("cx-corpora.en2ar.text.json.gz")
Offset = 10000
enToArMtEnginesStats = getMtEnginesStatistics(enToArTranslationsCorpora, enToArTranslationsMetaData)
enToArMtEnginesStats.head()
    Engine  Number of Articles  Average Contribution of Engine's Translations  Average Contribution of Human's Translations
1   Google                 796                                        0.427700                                      0.572300
0   Yandex                5569                                        0.206159                                      0.793661
2  scratch                 159                                        0.160902                                      0.832809

English to Spanish

enToEsTranslationsMetaData = getAllMetaDataTranslations(fromLanguage="en", toLanguage="es")
enToEsTranslationsCorpora = getTranslationsCorpora("cx-corpora.en2es.text.json.gz")
Offset = 10000
Offset = 20000
Offset = 30000
enToEsMtEnginesStats = getMtEnginesStatistics(enToEsTranslationsCorpora, enToEsTranslationsMetaData)
enToEsMtEnginesStats.head()
     Engine  Number of Articles  Average Contribution of Engine's Translations  Average Contribution of Human's Translations
2    Google                1412                                        0.450978                                      0.548314
1    Yandex                3082                                        0.267093                                      0.732907
0  Apertium               21436                                        0.211053                                      0.788807
3   scratch                 156                                        0.065029                                      0.934971

Conclusion

The statistics of machine translation engines for ("en" to "ar") and ("en" to "es") suggest that, in general, the Google machine translation engine requires less human editing effort, and thus provides better translation quality.

Choosing the best translation engine across all language pairs can be done with the functions presented here, but it requires downloading all the dump files; the analysis can then be run offline to save loading time.
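A minimal sketch of how such a loop might look, assuming the relevant dump files (named like the ones above) have already been downloaded, and restricting it to pairs translated from English since getAllMetaDataTranslations queries en.wikipedia.org:
# Sketch only: the list of pairs and the file-name pattern are assumptions
languagePairs = [("en", "ar"), ("en", "es")]   # extend with more pairs as needed
for fromLang, toLang in languagePairs:
    metaData = getAllMetaDataTranslations(fromLanguage=fromLang, toLanguage=toLang)
    corpora = getTranslationsCorpora("cx-corpora.{0}2{1}.text.json.gz".format(fromLang, toLang))
    print(fromLang, "->", toLang)
    print(getMtEnginesStatistics(corpora, metaData))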