Initialization

# Imports
import gzip
import json
import re

import mwapi
import pandas as pd

Some notes about the cxpublishedtranslations API:

  • Example and overview given here: https://www.mediawiki.org/wiki/Content_translation/Published_translations#List_of_published_source_and_target_titles
  • In the parameters, you may notice there is an 'offset'. The API will only return 500 results at a time and given that there are often more than 500 articles that were translated, this parameter tells the API which results you want. For the cxpublishedtranslations API, the articles are sorted by the date they were translated, so the deeper into the dataset you go (the higher the offset), the newer the translations are.
  • We do not have full data for articles that were translated before 2016-01-22, so I chose an offset (20000) that would get me a list of articles translated after that date. You may have to try a few offsets to find one that works. If the results are empty, that probably means your offset is too high.
# Parameters for accessing the API that contains metadata about each translated article
# Session against English Wikipedia; the user agent identifies this notebook to the API.
session = mwapi.Session(
    host='https://en.wikipedia.org',
    user_agent='mwapi (python) -- outreachy content translation')

# articles translated from English -> Spanish
parameters = {
    'action': 'query',
    'format': 'json',
    'list': 'cxpublishedtranslations',
    'from': 'en',       # source language code
    'to': 'es',         # target language code
    'limit': 500,       # the API returns at most 500 results per request
    'offset': 20000,    # skip earlier records (pre-2016 data is incomplete; see notes above)
}

Get set of translated articles to dig more deeply into

# Query the cxpublishedtranslations API; mwapi returns the parsed JSON response.
res = session.get(parameters)
# Peek at the first 10 translation records (notebook cell displays the value).
res['result']['translations'][:10]
[{'translationId': '374660',
  'sourceTitle': 'End Records',
  'targetTitle': 'End Records',
  'sourceLanguage': 'en',
  'sourceRevisionId': '746876472',
  'targetRevisionId': '102215116',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/End Records',
  'targetURL': '//es.wikipedia.org/wiki/End Records',
  'publishedDate': '20170928221445',
  'stats': {'any': 1.1047708138447,
   'human': 1.1047708138447,
   'mt': 0,
   'mtSectionsCount': 0}},
 {'translationId': '374661',
  'sourceTitle': '60S ribosomal protein L10',
  'targetTitle': 'Proteína L10 de la unidad ribosómica 60S\xa0',
  'sourceLanguage': 'en',
  'sourceRevisionId': '797871239',
  'targetRevisionId': '102215432',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/60S ribosomal protein L10',
  'targetURL': '//es.wikipedia.org/wiki/Proteína L10 de la unidad ribosómica 60S\xa0',
  'publishedDate': '20170928223045',
  'stats': {'any': 0.27921220127297,
   'human': 0.27476882430647,
   'mt': 0.0044433769664945,
   'mtSectionsCount': 3}},
 {'translationId': '374663',
  'sourceTitle': 'Ned Washington',
  'targetTitle': 'Ned Washington',
  'sourceLanguage': 'en',
  'sourceRevisionId': '796534066',
  'targetRevisionId': '102215375',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Ned Washington',
  'targetURL': '//es.wikipedia.org/wiki/Ned Washington',
  'publishedDate': '20170928222819',
  'stats': {'any': 1.0329474621549,
   'human': 1.0255268625705,
   'mt': 0.0074205995844464,
   'mtSectionsCount': 2}},
 {'translationId': '374674',
  'sourceTitle': 'Greek legislative election, 1936',
  'targetTitle': 'Elecciones parlamentarias de Grecia de 1936',
  'sourceLanguage': 'en',
  'sourceRevisionId': '797738429',
  'targetRevisionId': '102216616',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Greek legislative election, 1936',
  'targetURL': '//es.wikipedia.org/wiki/Elecciones parlamentarias de Grecia de 1936',
  'publishedDate': '20170928233753',
  'stats': {'any': 0.74281709880869,
   'human': 0.7281009110021,
   'mt': 0.014716187806587,
   'mtSectionsCount': 2}},
 {'translationId': '374675',
  'sourceTitle': 'Maltese general election, 1927',
  'targetTitle': 'Elecciones generales de Malta de 1927',
  'sourceLanguage': 'en',
  'sourceRevisionId': '787160410',
  'targetRevisionId': '102216883',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Maltese general election, 1927',
  'targetURL': '//es.wikipedia.org/wiki/Elecciones generales de Malta de 1927',
  'publishedDate': '20170928235113',
  'stats': {'any': 0.94422310756972,
   'human': 0.89322709163347,
   'mt': 0.050996015936255,
   'mtSectionsCount': 5}},
 {'translationId': '374719',
  'sourceTitle': 'Dançando',
  'targetTitle': 'Dançando',
  'sourceLanguage': 'en',
  'sourceRevisionId': '801281364',
  'targetRevisionId': '102222244',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Dançando',
  'targetURL': '//es.wikipedia.org/wiki/Dançando',
  'publishedDate': '20170929060104',
  'stats': {'any': 0.86393027832443,
   'human': 0.85704245150408,
   'mt': 0.0068878268203542,
   'mtSectionsCount': 4}},
 {'translationId': '374751',
  'sourceTitle': 'BFR (rocket)',
  'targetTitle': 'BFR (cohete)',
  'sourceLanguage': 'en',
  'sourceRevisionId': '802912035',
  'targetRevisionId': '102223820',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/BFR (rocket)',
  'targetURL': '//es.wikipedia.org/wiki/BFR (cohete)',
  'publishedDate': '20170929080018',
  'stats': {'any': 0.014279624893436,
   'human': 0.00078147200909349,
   'mt': 0.013498152884342,
   'mtSectionsCount': 4}},
 {'translationId': '374858',
  'sourceTitle': 'MatrixNet',
  'targetTitle': 'MatrixNet',
  'sourceLanguage': 'en',
  'sourceRevisionId': '774931364',
  'targetRevisionId': '102229789',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/MatrixNet',
  'targetURL': '//es.wikipedia.org/wiki/MatrixNet',
  'publishedDate': '20170929150431',
  'stats': {'any': 0.61847988077496,
   'human': 0.55290611028316,
   'mt': 0.065573770491803,
   'mtSectionsCount': 4}},
 {'translationId': '374869',
  'sourceTitle': 'Gradient boosting',
  'targetTitle': 'Gradient boosting',
  'sourceLanguage': 'en',
  'sourceRevisionId': '801498395',
  'targetRevisionId': '102230661',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/Gradient boosting',
  'targetURL': '//es.wikipedia.org/wiki/Gradient boosting',
  'publishedDate': '20180524022255',
  'stats': {'any': 0.050171256336484,
   'human': 0.048417591450884,
   'mt': 0.0017536648856008,
   'mtSectionsCount': 3}},
 {'translationId': '374880',
  'sourceTitle': 'RNA Helicase A',
  'targetTitle': 'Helicasa de ARN A',
  'sourceLanguage': 'en',
  'sourceRevisionId': '722925790',
  'targetRevisionId': '102231462',
  'targetLanguage': 'es',
  'sourceURL': '//en.wikipedia.org/wiki/RNA Helicase A',
  'targetURL': '//es.wikipedia.org/wiki/Helicasa de ARN A',
  'publishedDate': '20170929162131',
  'stats': {'any': 0.34936381281001,
   'human': 0.34670404715693,
   'mt': 0.0026597656530803,
   'mtSectionsCount': 3}}]

Alternative view of the data via Pandas

# Load the translation metadata records into a DataFrame for easier inspection.
df = pd.DataFrame(data=res['result']['translations'])
# Display the first ten rows.
df.head(n=10)
publishedDate sourceLanguage sourceRevisionId sourceTitle sourceURL stats targetLanguage targetRevisionId targetTitle targetURL translationId
0 20170928221445 en 746876472 End Records //en.wikipedia.org/wiki/End Records {'any': 1.1047708138447, 'human': 1.1047708138... es 102215116 End Records //es.wikipedia.org/wiki/End Records 374660
1 20170928223045 en 797871239 60S ribosomal protein L10 //en.wikipedia.org/wiki/60S ribosomal protein L10 {'any': 0.27921220127297, 'human': 0.274768824... es 102215432 Proteína L10 de la unidad ribosómica 60S //es.wikipedia.org/wiki/Proteína L10 de la uni... 374661
2 20170928222819 en 796534066 Ned Washington //en.wikipedia.org/wiki/Ned Washington {'any': 1.0329474621549, 'human': 1.0255268625... es 102215375 Ned Washington //es.wikipedia.org/wiki/Ned Washington 374663
3 20170928233753 en 797738429 Greek legislative election, 1936 //en.wikipedia.org/wiki/Greek legislative elec... {'any': 0.74281709880869, 'human': 0.728100911... es 102216616 Elecciones parlamentarias de Grecia de 1936 //es.wikipedia.org/wiki/Elecciones parlamentar... 374674
4 20170928235113 en 787160410 Maltese general election, 1927 //en.wikipedia.org/wiki/Maltese general electi... {'any': 0.94422310756972, 'human': 0.893227091... es 102216883 Elecciones generales de Malta de 1927 //es.wikipedia.org/wiki/Elecciones generales d... 374675
5 20170929060104 en 801281364 Dançando //en.wikipedia.org/wiki/Dançando {'any': 0.86393027832443, 'human': 0.857042451... es 102222244 Dançando //es.wikipedia.org/wiki/Dançando 374719
6 20170929080018 en 802912035 BFR (rocket) //en.wikipedia.org/wiki/BFR (rocket) {'any': 0.014279624893436, 'human': 0.00078147... es 102223820 BFR (cohete) //es.wikipedia.org/wiki/BFR (cohete) 374751
7 20170929150431 en 774931364 MatrixNet //en.wikipedia.org/wiki/MatrixNet {'any': 0.61847988077496, 'human': 0.552906110... es 102229789 MatrixNet //es.wikipedia.org/wiki/MatrixNet 374858
8 20180524022255 en 801498395 Gradient boosting //en.wikipedia.org/wiki/Gradient boosting {'any': 0.050171256336484, 'human': 0.04841759... es 102230661 Gradient boosting //es.wikipedia.org/wiki/Gradient boosting 374869
9 20170929162131 en 722925790 RNA Helicase A //en.wikipedia.org/wiki/RNA Helicase A {'any': 0.34936381281001, 'human': 0.346704047... es 102231462 Helicasa de ARN A //es.wikipedia.org/wiki/Helicasa de ARN A 374880
# I'll use this example for the rest of the notebook
gboost_mask = df['sourceTitle'] == 'Gradient boosting'
df[gboost_mask]
publishedDate sourceLanguage sourceRevisionId sourceTitle sourceURL stats targetLanguage targetRevisionId targetTitle targetURL translationId
8 20180524022255 en 801498395 Gradient boosting //en.wikipedia.org/wiki/Gradient boosting {'any': 0.050171256336484, 'human': 0.04841759... es 102230661 Gradient boosting //es.wikipedia.org/wiki/Gradient boosting 374869
# Pull the fields needed later out of the "Gradient boosting" example row (index 8).
gboost_example = df.loc[8]
gboost_source_title = gboost_example['sourceTitle']
gboost_target_title = gboost_example['targetTitle']
gboost_tid = gboost_example['translationId']
gboost_source_revid = gboost_example['sourceRevisionId']
for value in (gboost_source_title, gboost_target_title, gboost_tid, gboost_source_revid):
    print(value)
Gradient boosting
Gradient boosting
374869
801498395

Get corresponding parallel translation

Parallel translations can either be accessed through the dump files or API. Use the dump files if you are planning on analyzing the entire corpus (or a large proportion) of translated articles. The API is best for looking at a few examples.

Dump files

  • the dump files give you local access to the parallel translations and are the "friendly" way to access the data, especially if you are looking at a lot of examples.
  • download most recent .text.json.gz file based on instructions here: https://www.mediawiki.org/wiki/Content_translation/Published_translations#Dumps
  • upload the dump file to PAWS (there is an upload button if you go to your dashboard — click on the PAWS logo in the top right)
  • there is a bug where the dump files have extra commas that break the JSON schema and leads to an error if you call json.load directly on the fin variable. Instead, you have to remove them as below to load in the dump file.

API

# 1st option: Download dump to access all translated articles
# Read the gzipped dump as text and concatenate the stripped lines in one pass.
# (A single join avoids the quadratic cost of repeated `json_str += line`.)
with gzip.open('cx-corpora.en2es.text.json.gz', 'rt') as fin:
    json_str = "".join(line.strip() for line in fin)
# Known dump bug: runs of extra commas break the JSON schema, so collapse any
# run of 2+ commas into one before parsing.
# NOTE(review): this would also mangle a literal ',,' inside string content —
# acceptable per the upstream workaround, but worth confirming on new dumps.
json_str = re.sub(r',{2,}', ',', json_str)
parallel_corpus = json.loads(json_str)
# Example:
# id: a string composed of <translationID>/<sectionID>
# <translationID> is in the data accessed from the first API
# each section in the article that was translated gets its own <sectionID>
# mt indicates whether machine translation was used in the interface
print("Descriptive statistics:")
print("{0} translated sections.".format(len(parallel_corpus)))
print("{0} translated articles.".format(len(set(sec['id'].split('/')[0] for sec in parallel_corpus))))
# Count sections per machine-translation engine; sections with no MT ('mt' is
# falsy) are bucketed under 'no-mt'. Single counting statement replaces the
# duplicated get/+1 branches; first-seen insertion order (and thus the printed
# dict repr) is unchanged.
mt_counts = {}
for sec in parallel_corpus:
    service = sec['mt']['engine'] if sec['mt'] else 'no-mt'
    mt_counts[service] = mt_counts.get(service, 0) + 1
print("Machine translation services used: {0}".format(mt_counts))
print("\nExample:\n", parallel_corpus[0])
Descriptive statistics:
303040 translated sections.
28203 translated articles.
Machine translation services used: {'no-mt': 108098, 'Apertium': 163433, 'Yandex': 20242, 'Google': 9475, 'source-mt': 1, 'scratch': 1788, 'TestClient': 3}

Example:
 {'id': '10059/mwEg', 'sourceLanguage': 'en', 'targetLanguage': 'es', 'source': {'content': "Before New Montgomery Street was created, an inner street called Jane Street ran parallel to Second and Third Street.[1] In the 1870s, Montgomery Street South was established in its place as the southern extension of Montgomery Street, one of the main thoroughfares in San Francisco's Financial District, running north from Market to Telegraph Hill. The extension was strongly supported by businessman and Bank of California founder William Ralston – who started the construction of the original Palace Hotel, at the time the largest hotel in the Western United States – in an effort to expand San Francisco's business district to the yet undeveloped area south of Market.[2][3] Ralston's original plans to connect Montgomery Street to the waterfront failed due to the unwillingness of two property owners (governor Milton Latham and shipping baron John Parrott) to sell their mansions on Rincon Hill, which is why Montgomery Street South never got past Howard Street.[1][4]"}, 'mt': None, 'target': {'content': 'Antes de Montgomery Nuevo Calle estuvo creado, una calle interior llamó Jane Calle corrió paralela a Segundo y Tercera Calle.[1] En el @1870s, Sur de Calle del Montgomery estuvo establecido en su sitio como la extensión del sur de Montgomery Calle, uno del principal thoroughfares en el distrito Financiero de San Francisco, corriendo del norte de Mercado a Cerro de Telégrafo. 
La extensión era fuertemente apoyada por businessman y Banco de fundador de California William Ralston @– quién empezó la construcción del Hotel de Palacio original, en el tiempo el hotel más grande en los Estados Unidos Occidentales @– en un esfuerzo para expandir el distrito empresarial de San Francisco al todavía undeveloped sur de área de Mercado.[2][3] Ralston planes originales para conectar Montgomery Calle al waterfront falló debido al unwillingness de dos dueños de propiedad (gobernador Milton Latham y embarcando barón John Parrott) para vender sus mansiones en Rincon Cerro, el cual es por qué Montgomery Calle Al sur pasado conseguido nunca Howard Calle.[1][4]'}}
# Translated sections for "Gradient boosting" article
# A section id looks like "<translationID>/<sectionID>"; match on the first part.
for section in parallel_corpus:
    article_id, _, _ = section['id'].partition('/')
    if article_id == gboost_tid:
        print(section, '\n')
{'id': '374869/mwAQ', 'sourceLanguage': 'en', 'targetLanguage': 'es', 'source': {'content': 'Gradient boosting is a machine learning technique for regression and classification problems, which produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function.'}, 'mt': {'engine': 'Apertium', 'content': 'El gradiente que aumenta es una técnica de aprendizaje de la máquina para regresión y problemas de clasificación, el cual produce un modelo de predicción en la forma de un ensemble de modelos de predicción débil, típicamente árboles de decisión. Construye el modelo en una etapa-moda sensata como otros métodos de aumentar , y les generalice por dejar optimización de una función de pérdida diferenciable arbitraria.'}, 'target': {'content': 'El gradiente que aumenta es una técnica de aprendizaje automático\xa0utilizado para el análisis de la regresión y para problemas de clasificación estadística, el cual produce un modelo predictivo en la forma de un conjunto de modelos de predicción débiles, típicamente árboles de decisión. Construye el modelo de forma escalonada como lo hacen otros métodos de boosting ,\xa0y los generaliza permitiendo la optimización aribitraria de una función de pérdida diferenciable.'}} 

{'id': '374869/mwvQ', 'sourceLanguage': 'en', 'targetLanguage': 'es', 'source': {'content': 'Gradient boosting can be used in the field of learning to rank. The commercial web search engines Yahoo[12] and Yandex[13] use variants of gradient boosting in their machine-learned ranking engines.'}, 'mt': {'engine': 'Apertium', 'content': 'El gradiente que aumenta puede ser utilizado en el campo de aprender a rango. Los motores de búsqueda de web comerciales Yahoo[12] y Yandex[13] variantes de uso del gradiente que aumenta en su máquina-aprendido ranking motores.'}, 'target': {'content': 'La potenciación del gradiente puede ser utilizado en el campo de aprendizaje de clasificación. Los motores de búsqueda de web comerciales Yahoo[12] y Yandex[13]\xa0utilizan variantes de gradient boosting en sus motores de búsqueda.'}} 

{'id': '374869/undefined07679025be2a0b218e5c0', 'sourceLanguage': 'en', 'targetLanguage': 'es', 'source': {'content': 'See also'}, 'mt': {'engine': 'Apertium', 'content': 'Ve también'}, 'target': {'content': 'Ver también'}} 

{'id': '374869/mwCw', 'sourceLanguage': 'en', 'targetLanguage': 'es', 'source': {'content': 'The idea of gradient boosting originated in the observation by Leo Breiman[1] that boosting can be interpreted as an optimization algorithm on a suitable cost function. Explicit regression gradient boosting algorithms were subsequently developed by Jerome H. Friedman[2][3] simultaneously with the more general functional gradient boosting perspective of Llew Mason, Jonathan Baxter, Peter Bartlett and Marcus Frean.[4][5] The latter two papers introduced the abstract view of boosting algorithms as iterative functional gradient descent algorithms. That is, algorithms that optimize a cost function over function space by iteratively choosing a function (weak hypothesis) that points in the negative gradient direction. This functional gradient view of boosting has led to the development of boosting algorithms in many areas of machine learning and statistics beyond regression and classification.'}, 'mt': None, 'target': {'content': 'La idea de la potenciación del gradiente fue originado en la observación realizada por Leo Breiman[1] que el Boosting puede ser interpretado como un algoritmo de optimización en una función de coste adecuada. El gradiente de regresión explícito que aumenta los algoritmos fue posteriormente desarrollado por Jerome H. Friedman[2][3] Simultáneamente con el gradiente funcional más general que aumenta perspectiva de Llew Mason, Jonathan Baxter, Peter Bartlett y Marcus Frean.[4][5]\xa0Estos dos últimos documentos presentaron la visión abstracta de los algoritmos de aumento de potenciación como algoritmos iterativos de descenso de gradientes funcionales. Es decir, algoritmos que optimizan una función de coste sobre el espacio funcional mediante la elección iterativa de una función (hipótesis débil) que apunta en la dirección del gradiente negativo. 
Esta visión de gradiente funcional de potenciación ha llevado al desarrollo de algoritmos de potenciación en muchas áreas del aprendizaje automático y estadísticas más allá de la regresión y la clasificación.'}} 

# 2nd option: You could also retrieve the "Gradient boosting" sections through the API
# Create a new dictionary of parameters based on the API examples in the link above
# Get the results following the example at the start of the notebook

Analyses

Qualitative

Some starting examples include trying to better understand what happens to the translated article after it is created. The page history for every Wikipedia article is publicly available. Each article also has a corresponding talk page, in which editors might discuss the content on the page and other related items. If you are unfamiliar with how to access this content, see these overviews of how to access page history (https://en.wikipedia.org/wiki/Help:Page_history) and talk pages (https://en.wikipedia.org/wiki/Help:Talk_pages)

For example, for the English version of Gradient Boosting, the page history and talk page are reachable from the "View history" and "Talk" tabs at the top of the article.

Go through the edit histories for a few articles and begin to identify whether any trends emerge about the types of edits that happen to translated articles. Compare the translated and source articles in their current state. What types of content were added after the translation? Are the articles diverging in content or staying similar? What sorts of discussions occur on the talk pages of translated articles?

Eventually we can do this in a more robust manner: more carefully choosing which articles to examine, developing more concrete questions to answer, building a code book for annotating article histories, content, or discussions, etc.

Quantitative Analyses

More data can be accessed about the translations and what occurred after them. Try comparing statistics about edits, pageviews, etc. between the source and translated versions of articles. More advanced analyses in a project might eventually compare translated articles with similar articles that were not translated or classify edits based upon their 'type' for more fine-grained analyses of what happens to translated articles.

You can also programmatically access page views for the source and translated pages.

You can access page history as detailed below:

# Page revision history example
# MediaWiki Action API query for the revision history of the English source
# article, anchored at the revision the translation was based on.
revision_parameters = {
    "action": "query",
    "prop": "revisions",
    "titles": gboost_source_title,
    # For each revision, fetch the timestamp, editing user, and edit summary.
    "rvprop": "timestamp|user|comment",
    "rvlimit": 100,
    # Start at the revision that was translated (from the cx metadata above)...
    "rvstartid": gboost_source_revid,
    "rvslots": "main",
    "formatversion": "2",
    "format": "json",
    # ...and enumerate forward in time from there (oldest to newest).
    "rvdir": "newer"
}
gboost_revisions = session.get(revision_parameters)
# Notebook cell displays the raw API response.
gboost_revisions
{'batchcomplete': True,
 'query': {'pages': [{'pageid': 26649339,
    'ns': 0,
    'title': 'Gradient boosting',
    'revisions': [{'user': '207.141.65.5',
      'anon': True,
      'timestamp': '2017-09-20T01:12:14Z',
      'comment': 'Inconsistent partial derivative notation.'},
     {'user': '132.230.77.200',
      'anon': True,
      'timestamp': '2017-10-12T13:36:50Z',
      'comment': '/* Algorithm */'},
     {'user': '132.230.77.200',
      'anon': True,
      'timestamp': '2017-10-12T13:44:05Z',
      'comment': '/* Algorithm */'},
     {'user': 'InternetArchiveBot',
      'timestamp': '2017-10-22T02:24:58Z',
      'comment': 'Rescuing 2 sources and tagging 0 as dead. #IABot (v1.6beta2)'},
     {'user': '88.215.113.85',
      'anon': True,
      'timestamp': '2017-11-04T14:22:11Z',
      'comment': 'I suggest to use also the indicator function as described in the corresponding article. As this makes it more consistent between wiki articles.'},
     {'user': '209.6.143.163',
      'anon': True,
      'timestamp': '2017-11-11T05:25:08Z',
      'comment': '/* Algorithm */'},
     {'user': '209.6.143.163',
      'anon': True,
      'timestamp': '2017-11-11T05:26:21Z',
      'comment': '/* Algorithm */'},
     {'user': '209.6.143.163',
      'anon': True,
      'timestamp': '2017-11-11T05:26:49Z',
      'comment': '/* Algorithm */'},
     {'user': '209.6.143.163',
      'anon': True,
      'timestamp': '2017-11-11T05:27:05Z',
      'comment': '/* Algorithm */'},
     {'user': '209.6.143.163',
      'anon': True,
      'timestamp': '2017-11-11T05:28:47Z',
      'comment': '/* Algorithm */'},
     {'user': '209.6.143.163',
      'anon': True,
      'timestamp': '2017-11-11T05:32:56Z',
      'comment': '/* Algorithm */'},
     {'user': '209.6.143.163',
      'anon': True,
      'timestamp': '2017-11-11T22:52:26Z',
      'comment': '/* Algorithm */'},
     {'user': 'DeBit',
      'timestamp': '2017-11-23T06:49:40Z',
      'comment': '/* Algorithm */ fix formulation error'},
     {'user': '109.252.1.3',
      'anon': True,
      'timestamp': '2017-12-04T10:43:13Z',
      'comment': '/* See also */'},
     {'user': '137.132.228.38',
      'anon': True,
      'timestamp': '2018-01-09T08:39:56Z',
      'comment': '/* Algorithm */'},
     {'user': '185.85.220.200',
      'anon': True,
      'timestamp': '2018-01-16T10:01:02Z',
      'comment': '/* Informal introduction */'},
     {'user': 'EduardoValle',
      'timestamp': '2018-02-23T14:01:18Z',
      'comment': 'Typography: m-dashes'},
     {'user': '79.136.20.42',
      'anon': True,
      'timestamp': '2018-02-25T14:31:58Z',
      'comment': ''},
     {'user': 'Dhruvh',
      'timestamp': '2018-03-16T23:21:35Z',
      'comment': '/* See also */'},
     {'user': '2601:200:C001:7EE0:38F9:3DCD:BA4C:5103',
      'anon': True,
      'timestamp': '2018-03-22T05:02:00Z',
      'comment': '/* See also */'},
     {'user': '194.186.207.245',
      'anon': True,
      'timestamp': '2018-06-01T14:33:18Z',
      'comment': '/* Algorithm */'},
     {'user': '208.185.128.26',
      'anon': True,
      'timestamp': '2018-06-14T22:37:16Z',
      'comment': '/* Algorithm */'},
     {'user': 'Parrt',
      'timestamp': '2018-06-20T21:10:35Z',
      'comment': '/* See also */'},
     {'user': 'Chafe66',
      'timestamp': '2018-07-19T20:28:25Z',
      'comment': '/* Informal introduction */ what was shown was squared error, not mean squared error'},
     {'user': 'Chafe66',
      'timestamp': '2018-07-19T20:29:53Z',
      'comment': '/* Informal introduction */'},
     {'user': 'Chafe66',
      'timestamp': '2018-07-19T20:34:25Z',
      'comment': '/* Informal introduction */ Cleaned up grammar and punctuation a bit'},
     {'user': 'Bender235',
      'timestamp': '2018-08-27T14:17:39Z',
      'comment': '/* Names */ et al. is short for [[et alii]].'},
     {'user': 'Bender235', 'timestamp': '2018-08-27T14:27:43Z', 'comment': ''},
     {'user': 'Bender235', 'timestamp': '2018-08-28T23:21:20Z', 'comment': ''},
     {'user': 'SimonDedman',
      'timestamp': '2018-08-31T22:55:35Z',
      'comment': '/* Names */ added other names used and citations thereof, plus R package note.'},
     {'user': 'SimonDedman',
      'timestamp': '2018-08-31T22:56:12Z',
      'comment': '/* Names */ added full stop'},
     {'user': 'SimonDedman',
      'timestamp': '2018-08-31T22:57:37Z',
      'comment': '/* Names */ moved punctuation around citations'},
     {'user': 'Mehranimanesh',
      'timestamp': '2018-09-19T16:00:27Z',
      'comment': '/* Regularization */'},
     {'user': 'Bender235',
      'timestamp': '2018-10-10T17:20:06Z',
      'comment': '/* Stochastic gradient boosting */'},
     {'user': 'Mark viking',
      'timestamp': '2018-10-10T17:26:19Z',
      'comment': 'Adding local [[Wikipedia:Short description|short description]]: "Machine learning technique" ([[User:Galobtter/Shortdesc helper|Shortdesc helper]])'},
     {'user': 'Daviddwd',
      'timestamp': '2018-10-10T18:02:00Z',
      'comment': 'formatting'},
     {'user': 'Vernanimalcula',
      'timestamp': '2018-12-10T10:00:47Z',
      'comment': "Added the ''Machine learning bar'' (the box on the top) and the category ''Classification algorithms'' the same way they are in the [[Boosting (machine learning)]] article"},
     {'user': '2003:E5:170D:B401:24FD:1C80:E4C5:D7CE',
      'anon': True,
      'timestamp': '2019-01-28T10:43:29Z',
      'comment': '/* See also */'},
     {'user': 'Citation bot',
      'timestamp': '2019-02-07T02:03:59Z',
      'comment': 'Alter: template type. Add: year, pmid, doi, pages, issue, volume, journal, chapter-format, chapter-url, isbn. Removed accessdate with no specified URL. Removed parameters. Formatted [[WP:ENDASH|dashes]]. | You can [[WP:UCB|use this bot]] yourself. [[WP:DBUG|Report bugs here]]. | [[WP:UCB|User-activated]].'},
     {'user': 'Justin Ormont',
      'timestamp': '2019-02-15T12:47:23Z',
      'comment': '/* Shrinkage */ linked [[learning rate]]'}]}]}}