1+1
2
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style('whitegrid')

import requests
import json
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from urllib.parse import urlparse, quote, unquote

from collections import Counter

# requires pip install tldextract
import tldextract

import networkx as nx 
import time
import os

Load supporting data

# Load the older and the current lists of page titles. The encoding is given
# explicitly because the titles include non-ASCII characters and the platform
# default encoding is not guaranteed to be UTF-8.
with open('page_title_list_old.json', 'r', encoding='utf-8') as f:
    page_title_list_old = json.load(f)

with open('page_title_list_new.json', 'r', encoding='utf-8') as f:
    page_title_list = json.load(f)

# Report how many titles each file contributed.
print("There are {0} total page titles in the old file".format(len(page_title_list_old)))
print("There are {0} total page titles".format(len(page_title_list)))
There are 89 total page titles in the old file
There are 169 total page titles

There are some bad images we should avoid because they appear frequently in many articles. Define them here.

# Substrings of image file names to ignore. get_interlanguage_image_dict
# compares them case-insensitively against every image title.
bad_images = [
    "Crystal Clear", "Ambox content", "Blank template", "Nuvola apps",
    "Flag-map", "Italian Parliament yellow", "Muslim Brotherhood Emblem.jpg",
    "P military.png", "Society.png", "Wikinews", "Heckert GNU", "Hyksos.jpg",
    "Logo.png", "Blue iPod Nano.jpg", "Exquisite-opera.png", "Sitat",
    "Bluetank", "Cquote", "relief location map", "Images.png", "Image-silk",
    "pog.png", "Coldwar.png", "Aquote", "Alcatel 9109HA", "Ambox style",
    "Siegel der TSK", "Flowerpowerportfolio", "flag", "emblem", "coat",
    "seal", "crest", "Fleche-defaut", "Postscript-viewer-shaded", "Earthmap",
    "Wiki logo-3", "Searchtool", "stub.png", "Exquisite-",
    "Hannover Gehry-Tower", "Hw-caesar.png",
    "Hammer and sickle radius border", "Hero of the Soviet Union medal",
    "Clio", "P middle east", "P La Liberte", "Relief Map", "Europe relief",
    "Fifties jukebox", "Habs_un_headquarters",
]

Some basics for testing.

# Sample page title and language code used by the test cells.
page_title = "2013 Egyptian coup d'état"
lang = 'en'

Data scraping functions

There are six sets of data scraping functions:

  • Categories
  • Language links
  • Images
  • Revisions
  • External links
  • Image category memberships

We also use some helper functions:

  • chunk_list - Sometimes we want to pass multiple titles to the API, but there's a limit on how many we can pass at a time. This function breaks a list into a list of lists where each sub-list has a length no greater than some maximum size.
# http://stackoverflow.com/a/312464/1574687

def chunk_list(l,size=50):
    """Split a list into consecutive sub-lists of at most ``size`` items.

    l - the list (or other sliceable sequence) to split
    size - the maximum length of each chunk, defaults to 50

    Returns:
    chunks - a list of slices of ``l``; every chunk has ``size`` items except
        possibly the last one. An empty input gives an empty list.
    """
    # The result is built eagerly and returned as a list (not a generator) so
    # callers can index it or iterate over it more than once.
    return [l[start:start + size] for start in range(0, len(l), size)]

Wikidata scraping

 

Categories

We need to get all the members of categories, specifically to define the set of pages we will look at.

def get_category_members(category_title,lang='en'):
    """The function accepts a category_title and returns a list of the pages
    that are members of that category.
    
    category_title - a string (including "Category:" prefix) of the category name
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    members - a list containing strings of the page titles in the category
        (namespace 0 only, per the cmnamespace=0 filter in the query)
    
    """
    base_query = "https://{1}.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle={0}&cmprop=title&cmnamespace=0&cmlimit=500&format=json&formatversion=2".format(category_title,lang)

    members = list()
    query_string = base_query

    # The API returns at most 500 members per request; keep following the
    # continuation token until the API stops sending one so that large
    # categories are not silently truncated.
    while True:
        json_response = requests.get(query_string).json()

        # Error responses carry no 'query' key; treat that as "no members".
        for member in json_response.get('query', {}).get('categorymembers', []):
            members.append(member['title'])

        if 'continue' not in json_response:
            break

        query_string = base_query + "&cmcontinue={0}".format(json_response['continue']['cmcontinue'])
            
    return members
def get_commons_category_memberships(page_title):
    """The function accepts a page_title on Wikimedia Commons and returns the
    non-hidden categories the page belongs to.
    
    page_title - a string of the page name
    
    Returns:
    categories - a list of single-item dictionaries, each mapping a category
        title to the timestamp the API reports for that membership; empty if
        the response contains no page or the page has no categories
    
    """
    _S="https://commons.wikimedia.org/w/api.php?action=query&prop=categories&titles={0}&clprop=timestamp&clshow=!hidden&cllimit=500&format=json&formatversion=2".format(page_title)
    json_response = requests.get(_S).json()

    # Responses without a 'query' key, or with no pages, yield an empty result
    # instead of raising KeyError/IndexError.
    pages = json_response.get('query', {}).get('pages', [])
    if not pages:
        return list()

    # Only the first page is inspected because a single title was requested.
    return [{category['title']: category['timestamp']}
            for category in pages[0].get('categories', [])]
def get_category_memberships(category_title,lang='en'):
    """The function accepts a page title and returns the non-hidden categories
    the page belongs to in the given language edition of Wikipedia.
    
    category_title - a string of the page name whose categories are wanted
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    categories - a list of single-item dictionaries, each mapping a category
        title to the timestamp the API reports for that membership; empty if
        the response contains no page or the page has no categories
    
    """
    _S="https://{1}.wikipedia.org/w/api.php?action=query&prop=categories&titles={0}&clprop=timestamp&clshow=!hidden&cllimit=500&format=json&formatversion=2".format(category_title,lang)
    json_response = requests.get(_S).json()

    # Responses without a 'query' key, or with no pages, yield an empty result
    # instead of raising KeyError/IndexError.
    pages = json_response.get('query', {}).get('pages', [])
    if not pages:
        return list()

    # Only the first page is inspected because a single title was requested.
    return [{category['title']: category['timestamp']}
            for category in pages[0].get('categories', [])]
def get_commons_category_file_members(category_title):
    """The function accepts a category_title and returns a list of the files
    on Wikimedia Commons that are members of that category.
    
    category_title - a string (including "Category:" prefix) of the category name
    
    Returns:
    members - a list containing strings of the file page titles in the
        category (namespace 6 only, per the cmnamespace=6 filter in the query)
    
    """
    base_query = "https://commons.wikimedia.org/w/api.php?action=query&list=categorymembers&cmtitle={0}&cmprop=title&cmnamespace=6&cmlimit=500&format=json&formatversion=2".format(category_title)

    members = list()
    query_string = base_query

    # One loop handles both the first request and every continuation request;
    # it ends when the API stops returning a continuation token.
    while True:
        json_response = requests.get(query_string).json()

        # Error responses carry no 'query' key; treat that as "no members".
        subquery_member_list = json_response.get('query', {}).get('categorymembers', [])
        members += [i['title'] for i in subquery_member_list]

        if 'continue' not in json_response:
            break

        query_string = base_query + "&cmcontinue={0}".format(json_response['continue']['cmcontinue'])
            
    return members

Testing

# Try get_category_memberships on a file page from the Arabic edition and show
# the first {category title: timestamp} entry. Note that this rebinds the
# notebook-level page_title and lang variables.
page_title = 'ملف:General Al-Sisi, announcing the removal of President Morsi.png'
lang = 'ar'
get_category_memberships(page_title,'ar')[0]
{'تصنيف:صور غير حرة': '2013-07-29T23:52:02Z'}
# Issue the same categories query by hand to inspect the raw JSON response.
_S="https://{1}.wikipedia.org/w/api.php?action=query&prop=categories&titles={0}&clprop=timestamp&clshow=!hidden&cllimit=500&format=json&formatversion=2".format(page_title,lang)
json_response = requests.get(_S).json()
json_response
{'batchcomplete': True,
 'query': {'pages': [{'categories': [{'ns': 14,
      'timestamp': '2013-07-29T23:52:02Z',
      'title': 'تصنيف:صور غير حرة'}],
    'ns': 6,
    'pageid': 1739489,
    'title': 'ملف:General Al-Sisi, announcing the removal of President Morsi.png'}]}}

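A combination of functions is needed to get all the links that connect language versions of the same article.

  • get_interlanguage_links - returns a dictionary mapping language codes to the title of the page in each language edition.
  • get_page_outlinks - returns the list of wiki-link titles found in the paragraphs and unordered lists of a page.
  • get_outlink_translations - looks up, for every outlink in every language, the titles of the linked page in other languages.
  • get_interlanguage_link_usage - combines the three functions above for a single page title.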
def get_interlanguage_links(page_title,lang='en'):
    """The function accepts a page_title and returns a dictionary containing 
    the title of the page in its other languages
       
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition, 
        defaults to "en"
       
    Returns:
    interlanguage_link_dict - a dictionary keyed by lang codes and page title
        as values; it always contains an entry for ``lang`` itself holding
        ``page_title``
    """
    
    query_string = "https://{1}.wikipedia.org/w/api.php?action=query&format=json&prop=langlinks&formatversion=2&titles={0}&llprop=autonym|langname&lllimit=500".format(page_title,lang)
    json_response = requests.get(query_string).json()
    
    # Record the source page under the language it was requested in. This was
    # previously hard-coded to 'en', which mislabelled non-English titles.
    interlanguage_link_dict = {lang: page_title}

    # Responses without a 'query' key, or with no pages, add no further links.
    pages = json_response.get('query', {}).get('pages', [])

    if pages:
        # A distinct loop variable keeps the ``lang`` parameter from being overwritten.
        for langlink in pages[0].get('langlinks', []):
            interlanguage_link_dict[langlink['lang']] = langlink['title']
            
    return interlanguage_link_dict
def get_page_outlinks(page_title,lang='en',redirects=1):
    """Takes a page title and returns a list of wiki-links on the page. The 
    list may contain duplicates and the position in the list is approximately 
    where the links occurred.
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
    
    Returns:
    outlinks_list - a list of strings, the ``title`` attribute of every kept
        link: first those found in paragraphs, then those found in the items
        of unordered lists. Empty if the API response has no 'parse' payload.
    """
    
    # Replace spaces with underscores
    page_title = page_title.replace(' ','_')
    
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:','Digital object identifier','(page does not exist)']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&page={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(page_title,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    # Initialize an empty list to store the links
    outlinks_list = [] 

    def _collect_link_titles(container):
        """Append the titles of the interesting links inside ``container``."""
        for link in container.find_all('a'):
            if link.has_attr('title'):
                title = link['title']
                # Ignore links that aren't interesting or are redlinks.
                # ``get`` avoids a KeyError on anchors that have no href.
                if all(bad not in title for bad in bad_titles) and 'redlink' not in link.get('href', ''):
                    outlinks_list.append(title)
    
    if 'parse' in json_string:
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')

        # Delete tags associated with templates
        for tag in soup.find_all('tr'):
            tag.replace_with('')

        # For each paragraph tag, extract the titles within the links
        for para in soup.find_all('p'):
            _collect_link_titles(para)

        # For each unordered list, extract the titles within the child links.
        # Nested lists are visited more than once, so their links can repeat.
        for unordered_list in soup.find_all('ul'):
            for item in unordered_list.find_all('li'):
                _collect_link_titles(item)

    return outlinks_list
def get_outlink_translations(outlinks_per_lang):
    """Takes the outlinks found in each language edition and looks up the
    title of every linked page in the other language editions.
    
    outlinks_per_lang - a dictionary keyed by language code returning a list
        of page titles (duplicates are allowed and are queried only once)
    
    Returns:
    translation_dict - a dictionary keyed by language code returning a
        dictionary keyed by outlink title returning a dictionary keyed by
        language code with the title of the linked page in that language.
        Outlinks without language links map to an empty dictionary; outlinks
        that carry an interwiki prefix are left out.
    """
    # Titles starting with these prefixes point to other wikis rather than to
    # articles in this edition. The check is a prefix test: a substring test
    # would also drop ordinary titles that merely contain "n:" somewhere.
    interwiki_prefixes = ('n:','en:','wikt:')

    translation_dict = dict()
    
    for lang,outlinks in outlinks_per_lang.items():
        translation_dict[lang] = {}

        for page_title in set(outlinks):
            if page_title.startswith(interwiki_prefixes):
                continue

            query_string = "https://{1}.wikipedia.org/w/api.php?action=query&prop=langlinks&titles={0}&redirects=1&lllimit=500&format=json&formatversion=2".format(page_title,lang)
            json_response = requests.get(query_string).json()

            translation_dict[lang][page_title] = {}

            # Responses without 'query' or without pages are reported and the
            # entry is left as an empty dictionary.
            pages = json_response.get('query', {}).get('pages')

            if not pages:
                print("There are no outlinks in {0} version of \"{1}\"".format(lang,page_title))
                continue

            for ll in pages[0].get('langlinks', []):
                translation_dict[lang][page_title][ll['lang']] = ll['title']
                
    return translation_dict
def get_interlanguage_link_usage(page_title,lang='en'):
    """Takes a Wikipedia page title and return the interlanguage outlink dictionary
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    translation_dict - the result of get_outlink_translations: a dictionary
        keyed by language returning a dictionary keyed by outlink title
        returning a dictionary of that outlink's titles in other languages
    """
    outlinks_per_lang = {}

    # Pass ``lang`` through so a non-English title is looked up in its own edition.
    language_titles = get_interlanguage_links(page_title,lang)

    # A distinct loop variable keeps the ``lang`` parameter from being overwritten.
    for link_lang,title in language_titles.items():
        try:
            outlinks_per_lang[link_lang] = get_page_outlinks(title,link_lang)

        # Best effort: a failure in one language edition must not abort the
        # others. ``Exception`` excludes KeyboardInterrupt and SystemExit, so
        # the run can still be interrupted.
        except Exception:
            print("Error getting outlinks in {0} version of \"{1}\"".format(link_lang,page_title))
        
    translation_dict = get_outlink_translations(outlinks_per_lang)
    
    return translation_dict
# Show, for each outlink of the Arabic version, the English title it maps to;
# outlinks with no English counterpart are printed indented.
interlanguage_link_usage = get_interlanguage_link_usage(page_title)
for lang_title1,lang_links_dict in interlanguage_link_usage['ar'].items():
    if 'en' not in lang_links_dict:
        print("\t{0} --> No link!".format(lang_title1))
        continue
    print("{0} --> {1}".format(lang_title1,lang_links_dict['en']))
تركيا --> Turkey
أحداث دار الحرس الجمهوري 2013 --> 2013 Republican Guard headquarters clashes
	بوابة:ثورات الربيع العربي --> No link!
باراك أوباما --> Barack Obama
عبد ربه منصور هادي --> Abdrabbuh Mansur Hadi
القاهرة --> Cairo
شبه جزيرة سيناء --> Sinai Peninsula
2013 --> 2013
2 يوليو --> July 2
	الجزيرة مباشر مصر --> No link!
مظاهرات 30 يونيو 2013 في مصر --> June 2013 Egyptian protests
الصين --> China
	اعتصام رابعة العدوية --> No link!
سوريا --> Syria
تونس --> Tunisia
الانتخابات الرئاسية المصرية 2012 --> Egyptian presidential election, 2012
شارة رابعة --> Rabia sign
حزب النور --> Al-Nour Party
مهدي عاكف --> Mohammed Mahdi Akef
بوابة:القانون --> Portal:Law
رشاد البيومي --> Rashad al-Bayumi
	بوابة:القوات المسلحة المصرية --> No link!
ميدان التحرير --> Tahrir Square
	قوات الأمن المركزي --> No link!
مصر 25 --> Misr 25
ألمانيا --> Germany
فرنسا --> France
الأردن --> Jordan
مدينة نصر --> Nasr City
أحمد داود أوغلو --> Ahmet Davutoğlu
بان كي مون --> Ban Ki-moon
دستور مصر 2012 --> Egyptian Constitution of 2012
اليمن --> Yemen
الجيش المصري --> Egyptian Armed Forces
جبهة الإنقاذ الوطني (مصر) --> National Salvation Front (Egypt)
المملكة المتحدة --> United Kingdom
القوات المسلحة المصرية --> Egyptian Armed Forces
السعودية --> Saudi Arabia
التحالف الوطني لدعم الشرعية --> Anti-Coup Alliance
محافظة الشرقية --> Sharqia Governorate
عدلي منصور --> Adly Mansour
أحمد الطيب --> Ahmed el-Tayeb
قناة الجزيرة --> Al Jazeera
حركة تمرد --> Tamarod
عبد المجيد محمود --> Abdel Meguid Mahmoud
تواضروس الثاني --> Pope Tawadros II of Alexandria
ويليام هيغ --> William Hague
نايل سات --> Nilesat
وزارة الداخلية المصرية --> Ministry of Interior (Egypt)
محمد البرادعي --> Mohamed ElBaradei
	كفر الشيخ --> No link!
كاترين أشتون --> Catherine Ashton
صلاة الفجر --> Fajr prayer
الإثنين --> Monday
السودان --> Sudan
ما بعد انقلاب 2013 في مصر --> Post-coup unrest in Egypt (2013–2014)
سامي عنان --> Sami Hafez Anan
المحكمة الدستورية العليا --> Constitutional court
قطاع غزة --> Gaza Strip
الأمم المتحدة --> United Nations
	حلمي الجزار --> No link!
انقلاب 3 يوليو 2013 في مصر --> 2013 Egyptian coup d'état
بوابة:مصر --> Portal:Egypt
محمد بديع --> Mohammed Badie
الجزيرة الإنجليزية --> Al Jazeera English
عبد الفتاح السيسي --> Abdel Fattah el-Sisi
بوابة:الحرب --> Portal:War
بشار الأسد --> Bashar al-Assad
سعد الكتاتني --> Saad El-Katatni
	المقطم (حي) --> No link!
	مجلس القضاء الأعلى --> No link!
فيسبوك --> Facebook
مصر --> Egypt
حزب الوسط المصري --> Al-Wasat Party
قناة الناس --> Al-Nas (TV station)
الاتحاد الأفريقي --> African Union
محمد مرسي --> Mohamed Morsi
	طلعت عبد الله --> No link!
	مصطفى حجازي --> No link!
المجلس الأعلى للقوات المسلحة --> Supreme Council of the Armed Forces
الولايات المتحدة --> United States
الاتحاد الأوروبي --> European Union
خيرت الشاطر --> Khairat el-Shater
الإخوان المسلمين --> Muslim Brotherhood
الإمارات العربية المتحدة --> United Arab Emirates
ثورة 25 يناير --> Egyptian revolution of 2011
قناة الحافظ --> Al-Hafez
	النيابة العامة (مصر) --> No link!
	قناة الرحمة الفضائية --> No link!
عبد الله بن عبد العزيز آل سعود --> Abdullah of Saudi Arabia
1 يوليو --> July 1
الإخوان المسلمون في مصر --> History of the Muslim Brotherhood in Egypt
المحكمة الدستورية العليا المصرية --> Supreme Constitutional Court (Egypt)
8 يوليو --> July 8
المنصف المرزوقي --> Moncef Marzouki
4 يوليو --> July 4
محمد كامل عمرو --> Mohamed Kamel Amr
حازم صلاح أبو إسماعيل --> Hazem Salah Abu Ismail
وكالة أنباء الشرق الأوسط --> MENA (news)
يوتيوب --> YouTube
3 يوليو --> July 3
الخميس --> Thursday
بوابة:السياسة --> Portal:Politics
معبر رفح --> Rafah Border Crossing
حزب الحرية والعدالة --> Freedom and Justice Party (Egypt)
بوابة:عقد 2010 --> Portal:2010s
محمد حسني مبارك --> Hosni Mubarak

Images

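A combination of functions is needed to get all the image usage data we need.

  • get_page_images - returns the .png and .jpg image file names used on a page.
  • get_interlanguage_image_dict - collects the images of every language version of a page, skipping the bad images.
  • get_image_usage - looks up, per language, the articles in which each image appears.
  • (deprecated) get_global_usage - the same lookup through the globalusage endpoint; no longer used.
  • get_interlanguage_image_usage - combines the functions above for a single page title.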
def get_page_images(page_title,lang='en'):
    """
    Takes a Wikipedia page title and returns a list of the image filenames
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    page_image_list - a list of strings, each string being a file name of an
        image whose title contains '.png' or '.jpg'; empty (after printing a
        notice) when the response lists no images for the page
    """
    
    # Query string
    _S="https://{1}.wikipedia.org/w/api.php?action=query&format=json&titles={0}&formatversion=2&prop=images&imlimit=100".format(page_title,lang)
    
    # Make request
    json_response = requests.get(_S).json()
    
    # Only include these extensions
    # NOTE(review): this is a case-sensitive substring match, so titles ending
    # in '.JPG' or '.jpeg' are skipped - confirm that is intended.
    extension_check = ('.png', '.jpg')

    # Responses without 'query', without pages, or without an image list all
    # mean "no images" instead of raising KeyError/IndexError.
    pages = json_response.get('query', {}).get('pages', [])

    if not pages or 'images' not in pages[0]:
        print('No images found in {0} version of "{1}"'.format(lang,page_title))
        return []

    # NOTE(review): a single request is made with imlimit=100 and no
    # continuation, so pages with more images than that are truncated.
    return [image["title"] for image in pages[0]['images']
            if any(ext in image["title"] for ext in extension_check)]
def get_interlanguage_image_dict(langlink_dict,bad_images):
    """
    Takes a dictionary keyed by language with the page title in that language
    as value, plus a list of bad image names, and returns the images used by
    each language version of the page.
       
    langlink_dict - a dict generated by get_interlanguage_links
    bad_images - a list of strings; any image whose title contains one of
        them (compared case-insensitively) is ignored
    
    Returns:
    image_dict - a dict keyed by the language with a list of images appearing
         in the language version, without duplicates and in the order returned
         by get_page_images
    """
    # Lower-case the exclusion list once rather than once per image.
    lowered_bad_images = [bad.lower() for bad in bad_images]

    image_dict = dict()
    
    for lang,article in langlink_dict.items():
        kept_images = list()

        for image in get_page_images(article,lang):
            lowered_image = image.lower()

            if any(bad in lowered_image for bad in lowered_bad_images):
                continue

            if image not in kept_images:
                kept_images.append(image)

        image_dict[lang] = kept_images
                    
    return image_dict
def get_image_usage(lang_image_dict):
    """
    The function accepts a dictionary of {lang:[image name]}
    and returns, for every image, the articles of that same language edition
    in which the image appears
    
    lang_image_dict - a dictionary generated by get_interlanguage_image_dict
    
    Return:
    lang_image_page_use - A dictionary keyed by language code returning a dictionary
        keyed by filename returning a list containing article titles where image appears
    """
    
    lang_image_page_use = dict()

    for lang,image_list in lang_image_dict.items():
        lang_image_page_use[lang] = {}

        for image in image_list:
            query_string = "https://{1}.wikipedia.org/w/api.php?action=query&format=json&iutitle={0}&list=imageusage&iunamespace=0&iufilterredir=nonredirects&iulimit=500".format(image,lang)
            json_response = requests.get(query_string).json()

            # Responses without 'query' or 'imageusage' count as "not used anywhere".
            usage_payloads = json_response.get('query', {}).get('imageusage', [])
            using_pages = [payload['title'] for payload in usage_payloads]
            lang_image_page_use[lang][image] = using_pages

            # Flag images used on many pages: they are likely generic icons
            # that belong in the exclusion list.
            if len(using_pages) > 25:
                print('Unusually high global usage on \"{0}\"'.format(image))
                
    return lang_image_page_use

I don't trust the globalusage endpoint after a lot of testing and debugging:

```python
def get_global_usage(page_image_list,lang='en'):
    """The function accepts a list of filenames and returns a dictionary containing
    the other languages in which they appear

    page_image_list - a list of file name
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"

    Return:
    filelink_dict - A dictionary keyed by filename returning a dictionary keyed by
        language code returning a list containing the article titles in which the image appears
    """
    _filelink_dict = dict()
    page_image_chunks = chunk_list(page_image_list,5)
    for image_chunk in page_image_chunks:
        query_string = "https://{1}.wikipedia.org/w/api.php?action=query&format=json&titles={0}&prop=globalusage&guprop=url|namespace&gulimit=500".format('ملف:Ikhwan-logo.jpg','ar')
        json_response = requests.get(query_string).json()
        for _id,payload in json_response['query']['pages'].items():
            file_title = 'File:' + payload['title'].split(':')[1]
            if 'globalusage' in payload:
                _image_list = payload['globalusage']
                # Empty dictionary to be keyed by language with a list of article titles as values
                clean_image_dict = {}
                for linked_image in _image_list:
                    if 'ns' in linked_image and linked_image['ns'] == '0':
                        title = linked_image['title']
                        lang = linked_image['wiki'].split('.')[0]
                        if 'wikipedia.org' in linked_image['wiki']:
                            if lang in clean_image_dict:
                                clean_image_dict[lang].append(title)
                            else:
                                clean_image_dict[lang] = [title]
                if any(len(page_list) > 25 for file,page_list in clean_image_dict.items()):
                    print("Unusually high global usage on {0}".format(file_title))
                    _filelink_dict[file_title] = clean_image_dict
                else:
                    _filelink_dict[file_title] = clean_image_dict
    return _filelink_dict
```
def get_interlanguage_image_usage(page_title,bad_images):
    """ Takes a Wikipedia page title, gathers the images used by every
        language version of the page and looks up where each image is used
    
    page_title - a string with the title of the page on Wikipedia
    bad_images - a list of strings; images whose names contain one of them
        are excluded
        
    Returns:
    global_image_dict - the dictionary produced by get_image_usage, keyed by
        language code, then by filename, holding lists of article titles
    """
    # Chain the three steps: language links -> images per language -> usage.
    return get_image_usage(
        get_interlanguage_image_dict(get_interlanguage_links(page_title),bad_images)
    )

Testing images

# Raw globalusage query, kept for inspecting the endpoint's response.
# NOTE(review): pipe_joined_images is not defined in the visible part of this
# file - presumably a '|'-joined string of image titles; confirm before running.
query_string = "https://{1}.wikipedia.org/w/api.php?action=query&format=json&titles={0}&prop=globalusage&guprop=url|namespace&gulimit=500".format(pipe_joined_images,lang)
json_response = requests.get(query_string).json()
json_response
# Run the image pipeline step by step for one page title from the loaded list.
langlink_dict = get_interlanguage_links(page_title_list[10])
lang_image_dict = get_interlanguage_image_dict(langlink_dict,bad_images)
global_image_dict = get_image_usage(lang_image_dict)
global_image_dict
Unusually high global usage on "ملف:Fifties jukebox.png"
{'ar': {'ملف:Fifties jukebox.png': ['1958',
   'ليبيا',
   'النمسا',
   'سنجان',
   '1956',
   'تورينغن',
   'تونس',
   'السودان',
   'غينيا',
   'غانا',
   'الحرب الباردة',
   'عرعر (مدينة)',
   'نادي الهلال (السعودية)',
   'دويتشه فيله',
   'بادن-فورتمبيرغ',
   'سارلاند',
   'ألماس صناعي',
   'آنهوي',
   'لاوس',
   'كمبوديا',
   'هاواي',
   'الجمهورية العربية المتحدة',
   'ألاسكا',
   'حركة فتح',
   'الساموراي السبعة',
   'نادي الوكرة',
   'مذبحة قبية',
   'الحماية الفرنسية في تونس',
   '1950',
   '1953',
   'إذاعة صوت العرب',
   'أولاد حارتنا',
   'نادي النصر (السعودية)',
   'راشومون',
   'أم قصر',
   'كيب لاوي',
   '1952',
   '1954',
   '1959',
   '1951',
   'عبد الكريم قاسم',
   '1955',
   'ماكدونالدز',
   'بوينغ 707',
   'إيتا',
   'حزب التحرير',
   'روزا باركس',
   'كامب نو',
   'حلف بغداد',
   'حرب فيتنام',
   'آتشيه',
   'جاوة الغربية',
   'يوغياكارتا (محافظة خاصة)',
   'كالمنتان الوسطى',
   'سيد الخواتم',
   'دوري أبطال أوروبا',
   'نادي الوحدات',
   'العراب (فيلم)',
   'القرن 20',
   'الحزب الوطني الباسوتي',
   'حزب الدعوة الإسلامية',
   'رابطة اللبلاب',
   'جامعة بنغازي',
   'جامعة الملك سعود',
   'مصرف ليبيا المركزي',
   'علم تشاد',
   'نادي الاتحاد (سوريا)',
   'الوكالة الدولية للطاقة الذرية',
   'شباب الحسين',
   'السودان الإنجليزي المصري',
   'مؤسسة النقد العربي السعودي',
   'أولمبيك ليون',
   'بنك المغرب',
   'نادي الجيش الملكي',
   'داربا',
   'لايتي الجنوبية',
   'باب الحديد (فيلم)',
   'نيو ساينتست',
   'جامعة عين شمس',
   'كيرلا',
   'كأس آسيا',
   'معهد جوته',
   'حلف وارسو',
   'مؤتمر الصومام',
   'الحماية الإسبانية على المغرب',
   'جائزة غرامي',
   'متحف السودان القومي',
   'مطار محمد الخامس الدولي',
   'جامعة محمد الخامس',
   'الخطوط الملكية المغربية',
   'سبوتنك-1',
   'سبوتنك-2',
   'عصر الفضاء',
   'جامعة بغداد',
   'كأس آسيا 1956',
   'مذبحة كفر قاسم',
   'نادي كربلاء',
   'ليفايس',
   'لوكهيد سي-130 هيركوليز',
   'بي-52 ستراتوفورتريس',
   'كابوتي (فيلم)',
   'جزر الأنتيل الهولندية',
   'الاتحاد الكويتي لكرة القدم',
   'طيران الخليج',
   'العدوان الثلاثي',
   'كأس رامون دي كارانزا',
   'شوتينغ ستارز',
   'برجر كنج',
   'الاتحاد الأوروبي لكرة القدم',
   'الحزب النازي الأمريكي',
   'ناسا',
   'نازغول',
   'الألعاب الأولمبية الصيفية 1952',
   'الألعاب الأولمبية الصيفية 1956',
   'علم ليبيا',
   'مبنى الرجل الميت',
   'الحرب الهندوصينية الأولى',
   'المخابرات العامة المصرية',
   'المدرسة المحمدية للمهندسين',
   'أبطال خارقون (فيلم)',
   'مصرف الراجحي',
   'مباراة الدماء في الماء',
   'الثورة المجرية 1956',
   'دجاج كنتاكي',
   'ملعب ماراكانا',
   'المملكة العراقية',
   'حركة تموز 1958',
   'لحن الوفاء (فيلم)',
   'ثورة 23 يوليو',
   'حزب الوطنيين الأحرار',
   'بن هور (فيلم 1959)',
   'العراب: الجزء الثاني (فيلم)',
   'مهرجان برلين السينمائي الدولي',
   'سوبارو',
   'بيتزا هت',
   'الاتحاد الآسيوي لكرة القدم',
   'جبهة التحرير الفلسطينية',
   'البرلمان الأوروبي',
   'الاتحاد العربي السعودي لكرة القدم',
   'منطقة التجارة العربية الحرة الكبرى',
   'لجنة أمن الدولة (الاتحاد السوفيتي)',
   'فولكس فاغن بيتل',
   'المملكة الليبية',
   'ميكويان-غوريفيتش ميغ-21',
   'سود للطيران',
   'نورد للطيران',
   'حزب التضامن السنغالي',
   'الثور الهائج',
   'ريذم أند بلوز',
   'جبهة التحرير الوطني الجزائرية',
   'الاتحاد الأفريقي لكرة القدم',
   'أتوميوم',
   'مشروع مارشال',
   'دوايت أيزنهاور',
   'جريدة غزة الأسبوعية',
   'بين القصرين (رواية)',
   'دينا (راقصة)',
   'العربي (مجلة)',
   'الخطوط الجوية الكويتية',
   'بنك الكويت الوطني',
   'فضيحة لافون',
   'مجموعة عبد الرحمن علي التركي',
   'كلباء',
   'رمز شريطي',
   'جزيرة عيد الميلاد',
   'جزر كوكوس',
   'بلاي بوي',
   'دعاء الكروان (فيلم)',
   'القوات الجوية الجزائرية',
   'إدارة الطيران الفيدرالية',
   'بنك إنترا',
   'لوفتهانزا',
   'كلية الحقوق (جامعة عين شمس)',
   'النجم الأولمبي لحلق الوادي والكرم',
   'ولاية باها كاليفورنيا',
   'بايكونور',
   'البحر والسم',
   'البنك الأهلي التجاري',
   'ديمونا',
   'الحرب الكورية',
   'بطاقة ائتمان',
   'مجموعة بن لادن',
   'وفاق سطيف',
   'فولاري',
   'شركة المقاولون العرب',
   'أوركسترا القاهرة السيمفوني',
   'الخلاص من شاوشانك',
   'الأمل الرياضي بحمام سوسة',
   'داسو ميراج الثالثة',
   'نادي الحزم',
   'الاتحاد اللاتيني',
   'إذاعة الجيش الإسرائيلي',
   'مؤتمر باندونغ',
   'مؤسسة التمويل الدولية',
   'بنك الإسكندرية',
   'كارناتاكا',
   'حزب الاستقلال والعمل',
   'حزب الحرية النمساوي',
   'فرودو باجنز',
   'نادي الوطني',
   'نادي الفيحاء (السعودية)',
   'نادي هجر',
   'نادي الفتح (السعودية)',
   'نادي الرياض',
   'أوراوا رد دايموندز',
   'خوارزمية بووث للضرب',
   'محافظة تشاكو',
   'نامكو',
   'سينما باراديزو الجديدة',
   'الجمهورية الفرنسية الخامسة',
   'الجمهورية الفرنسية الرابعة',
   'مطار بيروت رفيق الحريري الدولي',
   'نعيمة عاكف',
   'كأس العالم لكرة القدم 1950',
   'الكنيسة الإنجيلية المشيخية',
   'جنيه فلسطيني',
   'عطلة رومانية (فيلم)',
   'ثقافة السودان',
   'الألعاب الأولمبية الشتوية 1956',
   'الألعاب الأولمبية الشتوية 1952',
   'جسر الأئمة',
   'جميلة (فيلم)',
   'أبارتايد',
   'النافذة الخلفية (فيلم)',
   'نيشان الافتخار',
   'عبقرينو',
   'اتحاد البحرين لكرة القدم',
   'اتحاد لاوس لكرة القدم',
   'اتحاد منغوليا لكرة القدم',
   'اتحاد نيبال لكرة القدم',
   'اتحاد سنغافورة لكرة القدم',
   'سي فور (مادة)',
   'ديزني لاند',
   'شعبة الاستخبارات العسكرية الإسرائيلية',
   'الجامعة اللبنانية',
   'إني راحلة',
   'اتحاد الكاميرون لكرة القدم',
   'اتحاد غينيا الاستوائية لكرة القدم',
   'اتحاد غامبيا لكرة القدم',
   'اتحاد غانا لكرة القدم',
   'مسابقة يوروفيجن للأغاني',
   'سيد بطاطس',
   'اتحاد موريشيوس لكرة القدم',
   'الجامعة الملكية المغربية لكرة القدم',
   'اتحاد الصومال لكرة القدم',
   'الجامعة التونسية لكرة القدم',
   'كأس العالم لكرة القدم 1954',
   'كأس العالم لكرة القدم 1958',
   'وكالة الأمن القومي الأمريكية',
   'تاميل نادو',
   'أوراسكوم للإنشاء والصناعة',
   'التلفزيون المستقل',
   'مطار بودابست فرانز ليست الدولي',
   'بتروبراز',
   'القوات المسلحة الملكية المغربية',
   'أندرا برديش',
   'ماديا براديش',
   'بنجاب (الهند)',
   'راجستان',
   'جيش التحرير المغربي',
   'ألعاب البحر الأبيض المتوسط',
   'نادي الشباب (دبي)',
   'بريجيت باردو',
   'الجيش الألماني',
   'مؤسسة العلوم الوطنية السويسرية',
   'معهد الدراسات القبطية (مصر)',
   'متحف الآثار الأردني',
   'فريمونت (كاليفورنيا)',
   'كارلسباد (كاليفورنيا)',
   'علم النيجر',
   'ليمون مر',
   'غولدبيري',
   'هدسون هورنيت',
   'تومبسون',
   'عقل جميل (فيلم)',
   'كويز شو (فيلم)',
   'أنتريم الشمالية (دائرة انتخابية في المملكة المتحدة)',
   'داون الجنوبية (دائرة انتخابية في المملكة المتحدة)',
   'داون الشمالية (دائرة انتخابية في المملكة المتحدة)',
   'الخروج من الجنة',
   'مقاطعة جوسيفين (أوريغون)',
   'عقد 1950',
   'المجلس الأعلى للقضاء (المغرب)',
   'الأخبار (جريدة مصرية)',
   'جاكبوت',
   'بارادايس',
   'الشيخ والبحر',
   'معركة إنتشون',
   'نظام معاهدة القارة القطبية الجنوبية',
   'معركة محيط بوسان',
   'الحركة الشعبية',
   'تشايكور ريزه سبور',
   'سيرن',
   'جريدة الندوة',
   'مخيم بلاطة',
   'نادي حرس الحدود',
   'جامعة النيلين',
   'مودم',
   'المليونير (فيلم)',
   'كيكي لخدمة التوصيل',
   'حريق القاهرة',
   'إفني',
   'المؤسسة العامة للخطوط الحديدية',
   'دورة الألعاب العربية',
   'البنك المركزي التونسي',
   'نادي جبلة',
   'نادي الفتوة (سوريا)',
   'نادي الحرية (سوريا)',
   'نادي النصر (ليبيا)',
   'نادي الأخضر',
   'نادي الهلال (ليبيا)',
   'قصر الشوق',
   'الحزب الديمقراطي الكردي السوري',
   'نادي الوحدة (ليبيا)',
   'نادي التحدي',
   'نادي المدينة (ليبيا)',
   'نادي رفيق',
   'نادي وفاق صبراته',
   'نادي دارنس',
   'نادي الترسانة (ليبيا)',
   'نادي الشرارة',
   'تجربة ميلر-يوري',
   'عهد 54',
   'الاتحاد العربي',
   'جمهورية كاريليا',
   'قلميقيا',
   'الرابطة التونسية المحترفة الثانية لكرة القدم',
   'لاجوس كيززلر',
   'الحكومة الجزائرية المؤقتة',
   'أختوبينسك',
   'أمورسك',
   'بيليبينو',
   'تشايكوفسكي (بيرم كراي)',
   'دوبنا',
   'فوكينو (بريانسك أوبلاست)',
   'غي',
   'كاتشكانار',
   'ميرني (أوبلاست أرخانغلسك)',
   'ميرني (جمهورية ساخا)',
   'أوكتايابرسك',
   'بروتفينو',
   'شيلخوف',
   'سوسنوفاي بور',
   'سوفتسك (تولا أوبلاست)',
   'تريوخغورني',
   'فولغودونسك',
   'زابوليارني',
   'زاريتشني (بينزا أوبلاست)',
   'زارينسك',
   'زيلينوغراد',
   'جيليزنوغورسك (كورسك أوبلاست)',
   'النادي العربي (قطر)',
   'اتحاد جمهورية الدومنيكان لكرة القدم',
   'اتحاد غوادلوب لكرة القدم',
   'اتحاد مارتينيك لكرة القدم',
   'اتحاد جزر الأنتيل الهولندية لكرة القدم',
   'سد سامراء',
   'أحداث ساقية سيدي يوسف',
   'منتخب أيرلندا الشمالية لكرة القدم',
   'ملعب فرانسو حريري',
   'حرب إفني',
   'نادي السليمانية',
   'نادي الشباب (العراق)',
   'أوبلاست بيلغورود',
   'ليبيتسك أوبلاست',
   'ماغادان أوبلاست',
   'التلفزيون الجزائري (قناة تلفزيونية)',
   'كومنفورم',
   'مدرسة الطب في جامعة ديفيد غيفين (كاليفورنيا)',
   'جامعة فلوريدا كلية الطب',
   'جامعة ميامي ميلر مدرسة الطب',
   'جامعة كنتاكي كلية الطب',
   'جامعة يشيفا كلية طب ألبرت آينستاين',
   'بيان راسل-أينشتاين',
   'المفوضية العليا للأمم المتحدة لشؤون اللاجئين',
   'طبيبك (مجلة)',
   'الثلاثية (نجيب محفوظ)',
   'ياد فاشيم',
   'وكالة أنباء الشرق الأوسط',
   'جمعية الهلال الأحمر الجزائري',
   'السوق الأوروبية المشتركة',
   'الإمبراطور الأخير',
   'صمت القصور (فيلم)',
   'نادي الهلال الناضوري',
   'كارثة ميونخ الجوية',
   'نادي اتحاد طنجة',
   'نادي الرجاء الملالي',
   'جائزة دوق إدنبرة',
   'صباح الخير أيها الحزن (رواية)',
   'وسام نجمة الشرف (مصر)',
   'شباب امرأة (فيلم)',
   'الملاك الصغير (فيلم)',
   'المنزل رقم 13 (فيلم)',
   'حكومة عموم فلسطين',
   'فيروز هانم (فيلم)',
   'الأخ الكبير (فيلم)',
   'سيدة القطار (فيلم)',
   'الطريق المسدود (فيلم)',
   'حكاية حب (فيلم)',
   'وكالة الأنباء الكويتية',
   'قصص من التاريخ (كتاب)',
   'التوت البري (فيلم)',
   'ملعب العباسيين',
   'توايلايت زون',
   'شونن الأسبوعية (مجلة)',
   'الكلية الجوية (مصر)',
   'تلفزيون طوكيو',
   'كاسيو',
   'بحرية لبنانية',
   'بيلد',
   'الكلية الفنية العسكرية (مصر)',
   'أزمة سبوتنك',
   'الجمهورية (جريدة مصرية)',
   'المساء (جريدة مصرية)',
   'بطولة العالم لكرة اليد للسيدات',
   'بطولة العالم لكرة اليد للرجال 1954',
   'بطولة العالم لكرة اليد للرجال 1958',
   'تشوبوت (محافظة)',
   'يونايتد برس إنترناشيونال',
   'محافظة فورموزا',
   'ميسيونس (محافظة)',
   'نيوكوين (محافظة)',
   'ريو نيغرو (محافظة)',
   'محافظة سانتا كروز (الأرجنتين)',
   'منطقة طنجة الدولية',
   'الحماية الفرنسية على المغرب',
   'فورست غامب',
   'الوسادة الخالية (فيلم)',
   'ابن حميدو (فيلم)',
   'العربي الصغير (مجلة)',
   'الصدر (مدينة)',
   'الهيئة العامة للاستعلامات (مصر)',
   'بين الأطلال (فيلم)',
   'هروب الدجاج',
   'السؤال (كتاب)',
   'حركة الأرض',
   'كارفور',
   'إنترلنغوا',
   'جامعة بار إيلان',
   'أرض السلام (فيلم)',
   'اليد الحمراء',
   'ستاد بورسعيد',
   'المرأة المجهولة (فيلم)',
   'ثورة التحرير الجزائرية',
   'معركة الشوابير الغيشة الجزائر',
   'كهوف الصلب',
   'كلية الألسن (جامعة عين شمس)',
   'مكتبة تشستر بيتي',
   'جسر على نهر كواي (فيلم)',
   'البطولات الفرنسية 1959 (كرة مضرب)',
   'البطولات الفرنسية 1958 (كرة مضرب)',
   'البطولات الفرنسية 1957 (كرة مضرب)',
   'البطولات الفرنسية 1950 (كرة مضرب)',
   'البطولات الفرنسية 1951 (كرة مضرب)',
   'البطولات الفرنسية 1952 (كرة مضرب)',
   'البطولات الفرنسية 1953 (كرة مضرب)',
   'البطولات الفرنسية 1954 (كرة مضرب)',
   'البطولات الفرنسية 1955 (كرة مضرب)',
   'البطولات الفرنسية 1956 (كرة مضرب)',
   'ملكة جمال العالم',
   'نادي النيل (الحصاحيصا)',
   'الخرطوم الوطني',
   'آلبيركس نيغاتا',
   'سيريزو أوساكا',
   'كاواساكي فرونتال',
   'توكوشيما فورتيس',
   'أرسنال ساراندي',
   'أفلام والت ديزني',
   'نادي الاتحاد الوجدي',
   'جوني غيتار (فيلم)',
   'تقارير كينسي',
   'الشركة التونسية للملاحة',
   'التلفزيون الإسباني',
   'بابا أمين (فيلم)',
   'المفوضية الأوروبية',
   'نادي النصر (مصر)',
   'ثورة الشواف',
   'الأممية الاشتراكية',
   'سايبريس',
   'باراموونت (كاليفورنيا)',
   'بوينا بارك (كاليفورنيا)',
   'كاليفورنيا سيتي (كاليفورنيا)',
   'كامببيل (كاليفورنيا)',
   'كوستا ميسا (كاليفورنيا)',
   'كوبيرتينو (كاليفورنيا)',
   'ديل مار (كاليفورنيا)',
   'دوارتي (كاليفورنيا)',
   'فونتانا (كاليفورنيا)',
   'فوونتين فالي (كاليفورنيا)',
   'غروفير بيتش (كاليفورنيا)',
   'سيتي أوف إندوستري (كاليفورنيا)',
   'إرويندالي (كاليفورنيا)',
   'لا بوينتي (كاليفورنيا)',
   'لاغونا نيغويل (كاليفورنيا)',
   'لاكيووود (كاليفورنيا)',
   'لاوندالي (كاليفورنيا)',
   'لوس ألتوس (كاليفورنيا)',
   'لوس ألتوس هيلس (كاليفورنيا)',
   'مكفارلاند (كاليفورنيا)',
   'ميلبيتاس (كاليفورنيا)']},
 'en': {},
 'lv': {'Attēls:Adib al-Shishakli.jpg': ['Valsts apvērsums Sīrijā (1954)']},
 'ru': {}}

Revisions

def get_page_revisions(page_title,lang='en'):
    """Takes a Wikipedia page title and returns its revision history
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
        
    Returns:
    df - a pandas DataFrame with one row per revision, holding the revision
        meta-data requested via rvprop (ids, timestamp, user, size, sha1) plus
        a "page" column set to page_title. Revisions are requested with
        rvdir=older.
    
    Raises:
    KeyError - if the API response lacks the expected 'query'/'pages'/'revisions'
        entries
    """
    
    # The base query is identical for every request; only rvcontinue is appended
    # when paging through histories longer than the 500-revision limit.
    base_query = "https://{1}.wikipedia.org/w/api.php?action=query&titles={0}&prop=revisions&rvprop=ids|timestamp|user|size|sha1&rvlimit=500&rvdir=older&format=json&formatversion=2".format(page_title,lang)
    
    revision_list = list()
    query_string = base_query
    
    while True:
        json_response = requests.get(query_string).json()
        revision_list += json_response['query']['pages'][0]['revisions']
        
        if 'continue' not in json_response:
            break
        
        # Continue against the SAME language edition as the first request
        # (this used to be hard-coded to 'en', which broke long non-English histories).
        query_continue = json_response['continue']['rvcontinue']
        query_string = "{0}&rvcontinue={1}".format(base_query,query_continue)
    
    df = pd.DataFrame(revision_list)
    df['page'] = page_title
    
    return df
def get_interlanguage_revisions(page_title,lang='en'):
    """Takes a Wikipedia page title and returns the revision histories of the
    page across its language editions
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    concat_df - a pandas DataFrame stacking the output of get_page_revisions for
        every language/title pair returned by get_interlanguage_links, with a
        "lang" column (the language key) and a "rev_num" column (the row index
        within that language's revision table). Language editions whose
        revisions could not be retrieved are reported and skipped; if none
        could be retrieved an empty DataFrame is returned.
    """
    revisions_df_dict = {}

    language_titles = get_interlanguage_links(page_title,lang)

    # Separate loop names so the lang argument is not overwritten
    for edition_lang,edition_title in language_titles.items():
        try:
            revisions_df_dict[edition_lang] = get_page_revisions(edition_title,edition_lang)
        
        # Best effort: one failing edition should not abort the others.
        # KeyboardInterrupt is not an Exception subclass, so it still propagates.
        except Exception:
            print("Error getting revisions in {0} version of \"{1}\"".format(edition_lang,edition_title))
    
    # pd.concat raises ValueError on an empty collection
    if not revisions_df_dict:
        return pd.DataFrame(columns=['lang','rev_num'])
    
    concat_df = pd.concat(list(revisions_df_dict.values()),keys=list(revisions_df_dict.keys()),
                          names=['lang','rev_num']).reset_index()
    
    return concat_df

Testing revisions

# Try both revision helpers on the sample page_title defined earlier in the notebook.
df = get_page_revisions(page_title)
interlanguage_revisions = get_interlanguage_revisions(page_title)
# Bare expression: the notebook displays the combined table as the cell output.
interlanguage_revisions
Error getting revisions in de version of "Militärputsch in Ägypten 2013"
lang rev_num anon page parentid revid sha1 size timestamp user
0 el 0 NaN Αιγυπτιακό πραξικόπημα 2013 5940387 5940999 941bb741e0c05af1f49e9a2ac30e430c16780905 2464 2016-07-17T18:12:05Z CubicStar
1 el 1 True Αιγυπτιακό πραξικόπημα 2013 5099766 5940387 ee736619d0950189506c0ce15453fda00125dc5d 2509 2016-07-17T10:30:14Z 62.74.13.200
2 el 2 NaN Αιγυπτιακό πραξικόπημα 2013 5099536 5099766 941bb741e0c05af1f49e9a2ac30e430c16780905 2464 2015-03-02T12:31:01Z IM-yb
3 el 3 NaN Αιγυπτιακό πραξικόπημα 2013 5099535 5099536 35a77e3b898cbf6840e35702a70ebd1cc0f09c4f 2483 2015-03-02T01:29:29Z IM-yb
4 el 4 NaN Αιγυπτιακό πραξικόπημα 2013 5099534 5099535 be1e377fd5a8c109dee4fe46b951ec2931874738 2474 2015-03-02T01:24:51Z IM-yb
5 el 5 NaN Αιγυπτιακό πραξικόπημα 2013 4808353 5099534 b547ad5950426059fcd716aa3990edd7aeec1bf2 2470 2015-03-02T01:24:39Z IM-yb
6 el 6 NaN Αιγυπτιακό πραξικόπημα 2013 4737488 4808353 5c129647cccc6952486d18412b44be2308f4bb7e 2198 2014-08-26T18:06:33Z Greek Scorpion
7 el 7 NaN Αιγυπτιακό πραξικόπημα 2013 4535642 4737488 d32a6076fa446f1611f5546d85d55f95c36d40c1 2198 2014-07-01T16:36:16Z Greek Scorpion
8 el 8 NaN Αιγυπτιακό πραξικόπημα 2013 4401633 4535642 ef96748aba8e8978c86e9ae2d54ad66b36c8712d 2199 2014-03-01T08:53:42Z Greek Scorpion
9 el 9 NaN Αιγυπτιακό πραξικόπημα 2013 4319888 4401633 f463cfc83d22cbeba88b2efbe774adfb5289370c 2216 2013-12-03T19:07:15Z Greek Scorpion
10 el 10 NaN Αιγυπτιακό πραξικόπημα 2013 4202824 4319888 4e5431d3d85f249114cf62b6165f12b1ead2f701 1897 2013-10-10T16:08:47Z C messier
11 el 11 NaN Αιγυπτιακό πραξικόπημα 2013 4202688 4202824 7e9700a9533d793e020e27ba11474b85d010510e 1616 2013-07-18T17:16:48Z Geraki
12 el 12 NaN Αιγυπτιακό πραξικόπημα 2013 4202643 4202688 8c538e448e53a48d39558f52abf2a5234f052cda 1710 2013-07-18T16:36:47Z Ttzavaras
13 el 13 True Αιγυπτιακό πραξικόπημα 2013 4186641 4202643 1d33fe92aace97e2b332e20d1174f91164e12b96 1704 2013-07-18T16:17:14Z 77.49.65.202
14 el 14 NaN Αιγυπτιακό πραξικόπημα 2013 4186464 4186641 b7d9c4cf756fe7bf27d6976ef3b5eceded3785a7 1725 2013-07-08T14:13:36Z Mikedelis
15 el 15 NaN Αιγυπτιακό πραξικόπημα 2013 0 4186464 f29e18fd083bdb192740bff1e09e026f88639b18 1675 2013-07-08T11:48:17Z SmartEd01
16 bg 0 NaN Държавен преврат в Египет (2013 г.) 6876196 7458521 cc0cbeea6ee6ea2f287e30f912f54592544f95db 4852 2016-09-22T14:26:41Z BotNinja
17 bg 1 NaN Държавен преврат в Египет (2013 г.) 6876191 6876196 4c4a950a75b3033214f1d0db37194bd5a89ba46f 4863 2015-09-22T05:33:42Z Elkost
18 bg 2 NaN Държавен преврат в Египет (2013 г.) 6872829 6876191 a2fcb2fab2aaadba804053e6d521bb7682b7cfd8 4807 2015-09-22T05:30:10Z Elkost
19 bg 3 NaN Държавен преврат в Египет (2013 г.) 6872826 6872829 ff3993219d28087de33d67b6cb645d0437936a4b 4754 2015-09-20T08:49:17Z Александър
20 bg 4 NaN Държавен преврат в Египет (2013 г.) 6872644 6872826 d366bf1293df5a9692304d4d8c379ee85ae187fc 4504 2015-09-20T08:44:04Z Александър
21 bg 5 NaN Държавен преврат в Египет (2013 г.) 6872643 6872644 38900a3e9ca5d945794069359ffb1d8eddb607b6 4261 2015-09-20T02:48:46Z Александър
22 bg 6 NaN Държавен преврат в Египет (2013 г.) 6872207 6872643 0e6356105f06edcc57b05588d7cce73b9b9e4447 3957 2015-09-20T02:42:14Z Александър
23 bg 7 NaN Държавен преврат в Египет (2013 г.) 6872151 6872207 77f268bb7ed5fe8246b5f20c22c981292deb6ab9 3657 2015-09-19T19:26:04Z V111P
24 bg 8 NaN Държавен преврат в Египет (2013 г.) 6872146 6872151 6fc7cf5f1c7048c6aad88794ca5e9a6f221f7ed2 3298 2015-09-19T18:52:46Z Александър
25 bg 9 NaN Държавен преврат в Египет (2013 г.) 0 6872146 6fc7cf5f1c7048c6aad88794ca5e9a6f221f7ed2 3298 2015-09-19T18:49:30Z Александър
26 ja 0 NaN 2013年エジプトクーデター 60069164 62549794 4fc835f47027ed51c31c8a98fb5a4d2da154946e 128291 2017-01-05T23:28:49Z Kalz
27 ja 1 NaN 2013年エジプトクーデター 57484956 60069164 857a8b94f978dc459468553ad0b9c54b9c4ac0dc 128248 2016-06-13T14:20:42Z JapaneseA
28 ja 2 NaN 2013年エジプトクーデター 55668190 57484956 89be1d20f5080c6ffa5a7155c59802c206bc2b8a 128261 2015-11-10T03:41:58Z Mercurius
29 ja 3 True 2013年エジプトクーデター 55614520 55668190 d1670e1e329d578648eb30bbfc49ea12d581e9b1 128321 2015-05-27T08:44:42Z 117.18.167.28
... ... ... ... ... ... ... ... ... ... ...
4731 fa 45 NaN کودتای ۲۰۱۳ مصر 10457823 10458020 67f90ab1c33d04ddb346514bc6fd43114f459681 10944 2013-07-04T19:44:16Z Gire 3pich2005
4732 fa 46 NaN کودتای ۲۰۱۳ مصر 10457459 10457823 dbd67c86bd437a9fe8825c9921d22f26b3c54eb7 10125 2013-07-04T18:24:41Z XerxesII
4733 fa 47 NaN کودتای ۲۰۱۳ مصر 10457337 10457459 499b4d295d9f960ed73af5be73754dc30194f781 9952 2013-07-04T16:17:20Z Farvartish
4734 fa 48 True کودتای ۲۰۱۳ مصر 10456589 10457337 f2ab0050ddc7210812e31c640047e5c1a63edd2f 10249 2013-07-04T15:24:39Z 178.79.186.74
4735 fa 49 NaN کودتای ۲۰۱۳ مصر 10456571 10456589 499b4d295d9f960ed73af5be73754dc30194f781 9952 2013-07-04T11:29:52Z Xerxessenior
4736 fa 50 NaN کودتای ۲۰۱۳ مصر 10456451 10456571 2e13871c330224d89296ce1c1c1c4c243aa8bfe5 9847 2013-07-04T11:23:12Z Xerxessenior
4737 fa 51 NaN کودتای ۲۰۱۳ مصر 10455723 10456451 36bd76fa7b7b428601822dd761ac2e01ac209fb2 9506 2013-07-04T10:54:39Z Xerxessenior
4738 fa 52 NaN کودتای ۲۰۱۳ مصر 10455719 10455723 820902e58f5b17b487d805eb22702f9ebc6602f8 9360 2013-07-04T06:13:30Z Alborzagros
4739 fa 53 NaN کودتای ۲۰۱۳ مصر 10455584 10455719 e4b018dcb3844da6f537a8f968cd942ba365f7be 9322 2013-07-04T06:12:51Z Alborzagros
4740 fa 54 NaN کودتای ۲۰۱۳ مصر 10455582 10455584 820902e58f5b17b487d805eb22702f9ebc6602f8 9360 2013-07-04T05:34:59Z Alborzagros
4741 fa 55 NaN کودتای ۲۰۱۳ مصر 10455576 10455582 e4b018dcb3844da6f537a8f968cd942ba365f7be 9322 2013-07-04T05:34:36Z Alborzagros
4742 fa 56 NaN کودتای ۲۰۱۳ مصر 10455575 10455576 e5e291b62b07dfeb404c5b4f37d698ea91fe0f57 9262 2013-07-04T05:33:20Z Hafez
4743 fa 57 NaN کودتای ۲۰۱۳ مصر 10455574 10455575 9323fd592edcdfe1a7cf0b05a42608d44c616289 9249 2013-07-04T05:33:12Z Alborzagros
4744 fa 58 NaN کودتای ۲۰۱۳ مصر 10455573 10455574 4f72d967d0f37b1c058a28b0a7936fa3a8bf31f7 9296 2013-07-04T05:33:02Z Alborzagros
4745 fa 59 NaN کودتای ۲۰۱۳ مصر 10455572 10455573 c4607dc5eae588ce74d74823042e1a488e705392 9340 2013-07-04T05:32:53Z Alborzagros
4746 fa 60 NaN کودتای ۲۰۱۳ مصر 10455571 10455572 ea4e61430e8c6b17c1228e66de886e8ec00fbe32 9371 2013-07-04T05:32:40Z Alborzagros
4747 fa 61 NaN کودتای ۲۰۱۳ مصر 10455569 10455571 6246e7fe03108ae456568a951bb450db2803e020 9413 2013-07-04T05:32:21Z Alborzagros
4748 fa 62 NaN کودتای ۲۰۱۳ مصر 10455562 10455569 09330936c6db8be89822eccab275244a0d108f02 9484 2013-07-04T05:31:55Z Alborzagros
4749 fa 63 NaN کودتای ۲۰۱۳ مصر 10455558 10455562 5a49742539a5e6e6668a7cf88b2e95a7187d00c4 9482 2013-07-04T05:29:09Z MahdiBot
4750 fa 64 NaN کودتای ۲۰۱۳ مصر 10455554 10455558 cbb3f2bb27f47cbd0a8c2418db5f56dc29d2a534 9243 2013-07-04T05:25:45Z Alborzagros
4751 fa 65 NaN کودتای ۲۰۱۳ مصر 10455495 10455554 8d38fca17b8c900514bc97ad693aae122a0ef4a8 9175 2013-07-04T05:22:32Z Alborzagros
4752 fa 66 NaN کودتای ۲۰۱۳ مصر 10455124 10455495 8d38fca17b8c900514bc97ad693aae122a0ef4a8 9175 2013-07-04T04:32:06Z Anvar11
4753 fa 67 NaN کودتای ۲۰۱۳ مصر 10455120 10455124 1352738cdc50b9988bbff5d3d5b61bd3292f346c 9120 2013-07-03T22:19:08Z XerxesII
4754 fa 68 NaN کودتای ۲۰۱۳ مصر 10455107 10455120 13486a0ea84f5973cbcbe93d86fd8755f62f05cf 9081 2013-07-03T22:17:14Z XerxesII
4755 fa 69 NaN کودتای ۲۰۱۳ مصر 10455105 10455107 edafc26a0cb4831bb06f9cb5a606000e2a5308d0 9041 2013-07-03T22:09:05Z XerxesII
4756 fa 70 NaN کودتای ۲۰۱۳ مصر 10455099 10455105 5836a8a45083fb35d6723cd524dbe987c830391a 9046 2013-07-03T22:08:33Z XerxesII
4757 fa 71 NaN کودتای ۲۰۱۳ مصر 0 10455099 120d9f2b717e2b511401bcf3679645cb52d62dcc 1806 2013-07-03T22:05:26Z XerxesII
4758 az 0 NaN Misirdə hərbi çeviriliş (2013) 2738514 2738537 9042fbd019fbe1531b715a1724f4fb00dc99fe7c 4243 2013-08-30T19:55:13Z Babək Akifoğlu
4759 az 1 NaN Misirdə hərbi çeviriliş (2013) 2738512 2738514 b9866729785387900c02bf74c81c495f1d00bede 4230 2013-08-30T18:52:20Z Allahverdi hesenov
4760 az 2 NaN Misirdə hərbi çeviriliş (2013) 0 2738512 4bb052bd0f1752df271f240581dc8e44e0e11993 4235 2013-08-30T18:50:38Z Allahverdi hesenov

4761 rows × 10 columns

# Collect the interlanguage revision table for the first three titles in
# page_title_list, keyed by page title.
revisions_df_dict = {}

for page in page_title_list[:3]:
    # NOTE(review): despite its name, extlink_usage holds the value returned by
    # get_interlanguage_revisions (revisions, not external links).
    extlink_usage = get_interlanguage_revisions(page_title=page)

    revisions_df_dict[page] = extlink_usage
def _unwrap_archive_url(url):
    """Returns the URL embedded in an Internet Archive link, or url unchanged
    when no embedded http:// or https:// URL is found"""
    starts = [url.find(marker) for marker in ('/http://','/https://')]
    starts = [s for s in starts if s != -1]
    
    if not starts:
        return url
    
    # Skip the leading "/" of the marker so the result starts at the scheme
    return url[min(starts) + 1:]


def get_external_links(page_title,lang='en'):
    """Takes a Wikipedia page title and returns the domains of its external links
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    external_links - a list of strings, one "domain.suffix" entry per external
        link returned by the API (duplicates are kept). Empty if the page is
        missing or has no external links. Only a single request with
        ellimit=500 is made, so at most the first batch of links is covered.
    """
    external_links = list()
    
    query_string = "https://{1}.wikipedia.org/w/api.php?action=query&titles={0}&prop=extlinks&ellimit=500&format=json&formatversion=2".format(page_title,lang)
    json_response = requests.get(query_string).json()
    
    page = json_response['query']['pages'][0]
    
    if 'missing' in page or 'extlinks' not in page:
        return external_links
    
    for link in page['extlinks']:
        raw_url = link['url']
        
        # Internet Archive links wrap the original URL; count the wrapped site.
        # Links that wrap an https:// URL (or nothing) used to raise IndexError here.
        if 'web.archive.org' in raw_url:
            raw_url = _unwrap_archive_url(raw_url)
        
        # Try to use the tldextract function, otherwise fall back to urlparse
        try:
            extracted = tldextract.extract(raw_url)
            netloc = "{0}.{1}".format(extracted.domain, extracted.suffix)
        except Exception:
            netloc = urlparse(raw_url).netloc
            
        external_links.append(netloc)
    
    return external_links
def get_interlanguage_extlinks_usage(page_title,lang='en'):
    """Takes a Wikipedia page title and returns the external link domains used
    by the page in each of its language editions
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    
    Returns:
    extlinks_per_lang - a dictionary keyed by language returning a Counter
        of the domains produced by get_external_links for that edition.
        Language editions that raise an error are reported and left out.
    """
    extlinks_per_lang = {}

    language_titles = get_interlanguage_links(page_title,lang)

    # Separate loop names so the lang argument is not overwritten
    for edition_lang,edition_title in language_titles.items():
        try:
            extlinks_per_lang[edition_lang] = Counter(get_external_links(edition_title,edition_lang))
        
        # Best effort: one failing edition should not abort the others.
        # KeyboardInterrupt is not an Exception subclass, so it still propagates.
        except Exception:
            print("Error getting extlinks in {0} version of \"{1}\"".format(edition_lang,edition_title))
    
    return extlinks_per_lang
# Try the external-link helper on the sample page_title defined earlier in the notebook.
interlanguage_extlinks = get_interlanguage_extlinks_usage(page_title)
# Bare expression: the notebook displays the result as the cell output.
interlanguage_extlinks

Scrape the data

Now the fun part, where we extract all the data. These steps may take several hours to complete, will generate hundreds of files (JSON and images), and may take up more than 100 MB of disk space on your local machine.

Scrape category members

This should only need to be run once to get the page_title_list we use at the start.

The code below is commented out to avoid accidentally regenerating and overwriting that list.

# Disabled on purpose: this only needs to run once to build page_title_list,
# and re-running it overwrites page_title_list_new.json.
#
# categories_by_decade = ["Category:1950s coups d'état and coup attempts",
#                         "Category:1960s coups d'état and coup attempts",
#                         "Category:1970s coups d'état and coup attempts",
#                         "Category:1980s coups d'état and coup attempts",
#                         "Category:1990s coups d'état and coup attempts",
#                         "Category:2000s coups d'état and coup attempts",
#                         "Category:2010s coups d'état and coup attempts"]
#
# # Gather the members of every decade category, skipping duplicates
# page_titles = list()
#
# for category in categories_by_decade:
#     decade_members = get_category_members(category)
#     for member in decade_members:
#         if member not in page_titles:
#             page_titles.append(member)
#
# # Keep only titles that contain one of these keywords (case-insensitive substring match)
# keywords = ['coup','incident','uprising','crisis','conspiracy','revolution',
#             'putsch','operation','incident','insurrection','plot','project']
#
# #page_title_list = [i for i in page_titles if any(j.lower() in [k.lower() for k in i.split(' ')] for j in keywords)]
# page_title_list = [i for i in page_titles if any(j.lower() in i.lower() for j in keywords)]
#
# print("There are {0} total page titles".format(len(page_title_list)))
#
# with open('page_title_list_new.json','w') as f:
#     json.dump(page_title_list,f)
# Retrieve the interlanguage link usage for every title in page_title_list.
# Each result is written to its own JSON file under "./Language links/" and
# all results are written together to coup_interlanguage_links.json at the end.
coup_interlanguage_links = dict()

for coup_title in page_title_list:
    print("...Starting interlanguage link retrieval for {0}".format(coup_title))
    try:
        interlanguage_link_usage = get_interlanguage_link_usage(coup_title)
        coup_interlanguage_links[coup_title] = interlanguage_link_usage
        with open("./Language links/{0}.json".format(coup_title),'w') as f:
            json.dump(interlanguage_link_usage,f)
    
    # Best effort: report the failing title and move on to the next one.
    # KeyboardInterrupt is not an Exception subclass, so interrupting the
    # long-running loop still works.
    except Exception:
        print("Error on retrieving interlanguage link usage for \"{0}\"".format(coup_title))
    
with open("coup_interlanguage_links.json",'w') as f:
    json.dump(coup_interlanguage_links,f)
...Starting interlanguage link retrieval for 1953 Colombian coup d'état
...Starting interlanguage link retrieval for 1954 Guatemalan coup d'état
...Starting interlanguage link retrieval for Coup d'état of Yanaon
...Starting interlanguage link retrieval for 1953 Iranian coup d'état
There are no outlinks in simple version of "wikt:nickname"
There are no outlinks in simple version of "wikt:resign"
There are no outlinks in simple version of "wikt:company"
There are no outlinks in simple version of "wikt:vote"
There are no outlinks in simple version of "wikt:replace"
There are no outlinks in simple version of "wikt:anger"
There are no outlinks in simple version of "wikt:revolution"
There are no outlinks in simple version of "wikt:refuse"
...Starting interlanguage link retrieval for 1958 Iraqi coup d'état
...Starting interlanguage link retrieval for 1959 Mosul uprising
...Starting interlanguage link retrieval for July 1958 Haitian coup d'état attempt
...Starting interlanguage link retrieval for May 1958 crisis in France
...Starting interlanguage link retrieval for 1958 Pakistani coup d'état
...Starting interlanguage link retrieval for Rawalpindi conspiracy
...Starting interlanguage link retrieval for 1954 Syrian coup d'état
...Starting interlanguage link retrieval for Silent Coup (Thailand)
...Starting interlanguage link retrieval for 1958 Venezuelan coup d'état
...Starting interlanguage link retrieval for 1960 Turkish coup d'état
There are no outlinks in en version of "tr:Namık Gedik"
There are no outlinks in ar version of "tr:Nam?k Gedik"
...Starting interlanguage link retrieval for 1961 Syrian coup d'état
...Starting interlanguage link retrieval for 1962 Burmese coup d'état
...Starting interlanguage link retrieval for 1963 Syrian coup d'état
...Starting interlanguage link retrieval for 1963 Togolese coup d'état
...Starting interlanguage link retrieval for 1964 Bolivian coup d'état
...Starting interlanguage link retrieval for 1965 Burundian coup d'état attempt
...Starting interlanguage link retrieval for July 1966 Burundian coup d'état
...Starting interlanguage link retrieval for November 1966 Burundian coup d'état
...Starting interlanguage link retrieval for 1966 Nigerian coup d'état
...Starting interlanguage link retrieval for 1966 Republic of the Congo coup d'état attempt
...Starting interlanguage link retrieval for 1966 Syrian coup d'état
There are no outlinks in en version of "wikt:rank and file"
...Starting interlanguage link retrieval for 1968 Republic of the Congo coup d'état
...Starting interlanguage link retrieval for 1969 Libyan coup d'état
...Starting interlanguage link retrieval for 1969 Sudanese coup d'état
...Starting interlanguage link retrieval for Algiers putsch of 1961
There are no outlinks in ja version of "fr:Chérif Sid Cara"
There are no outlinks in ja version of "fr:1er régiment étranger de parachutistes"
There are no outlinks in en version of "fr:Patrick Pesnot"
There are no outlinks in es version of "fr:Putsch des Généraux"
...Starting interlanguage link retrieval for Argentine Revolution
...Starting interlanguage link retrieval for 1964 Brazilian coup d'état
...Starting interlanguage link retrieval for Bulgarian coup d'état attempt of 1965
...Starting interlanguage link retrieval for Saint-Sylvestre coup d'état
...Starting interlanguage link retrieval for 1962 Ceylonese coup d'état attempt
...Starting interlanguage link retrieval for 1966 alleged Ceylonese coup d'état attempt
...Starting interlanguage link retrieval for 1960 Ethiopian coup attempt
...Starting interlanguage link retrieval for 1964 Gabon coup d'état
...Starting interlanguage link retrieval for Operation Guitar Boy
...Starting interlanguage link retrieval for 1963 Honduran coup d'état
...Starting interlanguage link retrieval for Hukou incident
...Starting interlanguage link retrieval for 17 July Revolution
...Starting interlanguage link retrieval for November 1963 Iraqi coup d'état
...Starting interlanguage link retrieval for Ramadan Revolution
...Starting interlanguage link retrieval for May 16 coup
There are no outlinks in ru version of "no:Chang Do-yong"
There are no outlinks in zh version of "ko:대한민국 6군단"
There are no outlinks in zh version of "ko:1공수특전여단"
There are no outlinks in zh version of "ko:대한민국 2작전사령부"
There are no outlinks in en version of "wiktionary:혁명"
There are no outlinks in en version of "wiktionary:정변"
There are no outlinks in en version of "wiktionary:군사"
...Starting interlanguage link retrieval for 1966 Nigerian counter-coup
...Starting interlanguage link retrieval for 1963 South Vietnamese coup
...Starting interlanguage link retrieval for 1964 South Vietnamese coup
...Starting interlanguage link retrieval for 1965 South Vietnamese coup
...Starting interlanguage link retrieval for Buddhist Uprising
...Starting interlanguage link retrieval for December 1964 South Vietnamese coup
...Starting interlanguage link retrieval for September 1964 South Vietnamese coup attempt
...Starting interlanguage link retrieval for 1960 South Vietnamese coup attempt
There are no outlinks in zh version of "vi:Lữ đoàn Liên binh Phòng vệ Tổng thống Phủ"
...Starting interlanguage link retrieval for Tacnazo insurrection
...Starting interlanguage link retrieval for 1966 Upper Voltan coup d'état
...Starting interlanguage link retrieval for 1971 Moroccan coup attempt
There are no outlinks in en version of "fr:Henri Dubois-Roquebert"
...Starting interlanguage link retrieval for 1971 Sudanese coup d'état
...Starting interlanguage link retrieval for 1972 Moroccan coup attempt
...Starting interlanguage link retrieval for 1972 Republic of the Congo coup d'état attempt
...Starting interlanguage link retrieval for 1976 Afghanistan attempted coup
...Starting interlanguage link retrieval for 1976 Burundian coup d'état
...Starting interlanguage link retrieval for 1977 Sudan Juba coup d'état attempt
...Starting interlanguage link retrieval for 1979 Equatorial Guinea coup d'état
...Starting interlanguage link retrieval for Saur Revolution
...Starting interlanguage link retrieval for 1976 Argentine coup d'état
...Starting interlanguage link retrieval for Cambodian coup of 1970
...Starting interlanguage link retrieval for Carnation Revolution
There are no outlinks in be-x-old version of "pt:César Moreira Baptista"
There are no outlinks in be-x-old version of "pt:Paulo de Carvalho"
There are no outlinks in be-x-old version of "pt:Verão Quente de 1975"
There are no outlinks in be-x-old version of "pt:Rui Patrício (empresário)"
There are no outlinks in be-x-old version of "pt:Direção-Geral de Segurança"
There are no outlinks in fr version of "pt:Vasco Lourenço"
There are no outlinks in it version of "v:Scrittori portoghesi: La liberazione in Africa e in Portogallo"
There are no outlinks in it version of "v:"
There are no outlinks in es version of "m:w:fr:Révolution des œillets"
There are no outlinks in es version of "m:w:pt:Revolução dos Cravos"
...Starting interlanguage link retrieval for 1975 Chadian coup d'état
...Starting interlanguage link retrieval for 1973 Chilean coup d'état
There are no outlinks in en version of "wikisource:Salvador Allende's Last Speech"
...Starting interlanguage link retrieval for Clockwork Orange (plot)
...Starting interlanguage link retrieval for 1974 Cypriot coup d'état
...Starting interlanguage link retrieval for Ecclesiastical coup
...Starting interlanguage link retrieval for Operation Green Sea
...Starting interlanguage link retrieval for 1975 Honduran coup d'état
...Starting interlanguage link retrieval for 1978 Honduran coup d'état
...Starting interlanguage link retrieval for June 4th revolution in Ghana
...Starting interlanguage link retrieval for Operation Fair Play
...Starting interlanguage link retrieval for Project 571
...Starting interlanguage link retrieval for Coup d'état of December Twelfth
There are no outlinks in zh version of "ko:이건영 (1926년)"
There are no outlinks in zh version of "ko:수도방위사령부"
There are no outlinks in zh version of "ko:정병주"
There are no outlinks in zh version of "ko:하나회"
There are no outlinks in zh version of "ko:정호용"
...Starting interlanguage link retrieval for 1973 Uruguayan coup d'état
...Starting interlanguage link retrieval for 1980 Liberian coup d'état
...Starting interlanguage link retrieval for 1980 Surinamese coup d'état
There are no outlinks in en version of "nl:Frank Wijngaarde"
...Starting interlanguage link retrieval for 1980 Turkish coup d'état
...Starting interlanguage link retrieval for 1981 Central African Republic coup d'état
...Starting interlanguage link retrieval for 1987 Burundian coup d'état
...Starting interlanguage link retrieval for 1987 Republic of the Congo coup d'état attempt
...Starting interlanguage link retrieval for 1987 Tunisian coup d'état
There are no outlinks in en version of "fr:Constitution tunisienne de 1959"
There are no outlinks in en version of "fr:Gazoduc"
There are no outlinks in en version of "fr:Nicolas Beau"
There are no outlinks in en version of "it:Commissione parlamentare"
There are no outlinks in en version of "it:Fulvio Martini"
...Starting interlanguage link retrieval for 1989 Sudanese coup d'état
...Starting interlanguage link retrieval for 1981 Bahraini coup d'état attempt
...Starting interlanguage link retrieval for 1980 Bolivian coup d'état
...Starting interlanguage link retrieval for 1989 Burkinabé coup d'état attempt
...Starting interlanguage link retrieval for 1984 Cameroonian coup attempt
...Starting interlanguage link retrieval for Coup d'état of May Seventeenth
There are no outlinks in zh version of "ko:국가보위비상대책위원회"
There are no outlinks in zh version of "ko:이희성 (1924년)"
...Starting interlanguage link retrieval for 1987 Fijian coups d'état
...Starting interlanguage link retrieval for June 1988 Haitian coup d'état
...Starting interlanguage link retrieval for September 1988 Haitian coup d'état
...Starting interlanguage link retrieval for Nojeh coup plot
...Starting interlanguage link retrieval for 1982 Kenyan coup d'état attempt
...Starting interlanguage link retrieval for 1988 Maldives coup d'état
...Starting interlanguage link retrieval for 1989 Panamanian coup d'état attempt
...Starting interlanguage link retrieval for People Power Revolution
...Starting interlanguage link retrieval for 1986–90 Philippine coup attempts
...Starting interlanguage link retrieval for 1989 Philippine coup attempt
...Starting interlanguage link retrieval for Romanian Revolution
There are no outlinks in nn version of "sv:Rumänska revolutionen 1989"
There are no outlinks in nn version of "sv:Huvudsida"
There are no outlinks in de version of "ro:Lista cărţilor referitoare la revoluţia română"
There are no outlinks in ja version of "ro:Partidul Socialist al Muncii"
There are no outlinks in zh version of "ja:1989年のルーマニア革命に関する書籍一覧"
...Starting interlanguage link retrieval for 1981 Seychelles coup d'état attempt
...Starting interlanguage link retrieval for 1982 Spanish coup d'état attempt
...Starting interlanguage link retrieval for 1980 Upper Voltan coup d'état
...Starting interlanguage link retrieval for 1982 Upper Voltan coup d'état
...Starting interlanguage link retrieval for 1983 Upper Voltan coup d'état attempt
...Starting interlanguage link retrieval for 1991 Lesotho coup d'état
...Starting interlanguage link retrieval for 1991 Malian coup d'état
...Starting interlanguage link retrieval for 1991 Thai coup d'état
...Starting interlanguage link retrieval for 1994 Gambian coup d'état
...Starting interlanguage link retrieval for 1995 Azerbaijani coup d'état attempt
...Starting interlanguage link retrieval for 1995 Pakistani coup d'état attempt
...Starting interlanguage link retrieval for 1996 Bangladesh coup d'état attempt
...Starting interlanguage link retrieval for 1994 Bophuthatswana crisis
...Starting interlanguage link retrieval for Coup of the Volunteers
...Starting interlanguage link retrieval for 1993 Azeri coup d'état
There are no outlinks in pt version of "fr:Coup d'État de 1993 en Azerbaïdjan"
...Starting interlanguage link retrieval for 1993 Guatemalan constitutional crisis
...Starting interlanguage link retrieval for 1991 Haitian coup d'état
...Starting interlanguage link retrieval for 1999 Ivorian coup d'état
There are no outlinks in pt version of "fr:Coup d'État de 1999 en Côte d'Ivoire"
...Starting interlanguage link retrieval for 1999 Pakistani coup d'état
...Starting interlanguage link retrieval for 1992 Peruvian constitutional crisis
...Starting interlanguage link retrieval for 1993 Russian constitutional crisis
There are no outlinks in en version of "ru:Гуслянников, Василий Дмитриевич"
There are no outlinks in be version of "ru:Советы"
...Starting interlanguage link retrieval for 1992 Sierra Leonean coup d'état
...Starting interlanguage link retrieval for 1991 Soviet coup d'état attempt
There are no outlinks in ja version of "ru:Форос"
There are no outlinks in fr version of "ru:Государственный комитет по чрезвычайному положению"
There are no outlinks in zh version of "ru:Карпухин, Виктор Фёдорович"
There are no outlinks in zh version of "ru:Лебедь, Александр Иванович"
There are no outlinks in zh version of "ru:Агеев, Гений Евгеньевич"
There are no outlinks in zh version of "ru:Усов, Владимир Александрович"
There are no outlinks in zh version of "ru:Кричевский, Илья Маратович"
There are no outlinks in zh version of "ru:Лефортовская тюрьма"
There are no outlinks in zh version of "ru:Лукьянов, Анатолий Иванович"
There are no outlinks in zh version of "ru:Тизяков, Александр Иванович"
There are no outlinks in zh version of "ru:4-я гвардейская танковая дивизия"
There are no outlinks in zh version of "ru:Комарь, Дмитрий Алексеевич"
There are no outlinks in zh version of "ru:Оле́г Дми́триевич Бакла́нов"
There are no outlinks in zh version of "ru:Болдин, Валерий Иванович"
There are no outlinks in zh version of "ru:Кобец, Константин Иванович"
There are no outlinks in zh version of "ru:Пономарёв, Лев Александрович"
There are no outlinks in en version of "ru:Тизяков, Александр Иванович"
There are no outlinks in en version of "ru:Болдин, Валерий Иванович"
...Starting interlanguage link retrieval for 1993 alleged Turkish military coup
...Starting interlanguage link retrieval for 1992 Venezuelan coup d'état attempts
...Starting interlanguage link retrieval for 2001 Burundian coup d'état attempt
...Starting interlanguage link retrieval for 2001 Central African Republic coup d'état attempt
...Starting interlanguage link retrieval for 2003 Central African Republic coup d'état
...Starting interlanguage link retrieval for 2004 Haitian coup d'état
There are no outlinks in ru version of "fr:Buteur Métayer"
There are no outlinks in ru version of "fr:Amiot Métayer"
...Starting interlanguage link retrieval for 2005 Mauritanian coup d'état
...Starting interlanguage link retrieval for 2003 São Tomé and Príncipe coup d'état
...Starting interlanguage link retrieval for 2006 Fijian coup d'état
...Starting interlanguage link retrieval for 2006 Thai coup d'état
...Starting interlanguage link retrieval for 2006–08 Bangladeshi political crisis
...Starting interlanguage link retrieval for 2003 Burkinabé coup d'état attempt
...Starting interlanguage link retrieval for 2004 Chadian coup d'état attempt
...Starting interlanguage link retrieval for 2006 Chadian coup d'état attempt
...Starting interlanguage link retrieval for 2000 Ecuadorian coup d'état
...Starting interlanguage link retrieval for Second EDSA Revolution
...Starting interlanguage link retrieval for 2004 Equatorial Guinea coup d'état attempt
...Starting interlanguage link retrieval for 2000 Fijian coup d'état
...Starting interlanguage link retrieval for 2008 Guinean coup d'état
...Starting interlanguage link retrieval for 2009 Honduran coup d'état
...Starting interlanguage link retrieval for Tulip Revolution
...Starting interlanguage link retrieval for 2006 Malagasy coup d'état attempt
...Starting interlanguage link retrieval for 2008 Mauritanian coup d'état
...Starting interlanguage link retrieval for 2002 Venezuelan coup d'état attempt
...Starting interlanguage link retrieval for List of coups d'état and coup attempts since 2010
...Starting interlanguage link retrieval for 2010 Madagascar coup d'état attempt
...Starting interlanguage link retrieval for 2011 Bangladesh coup d'état attempt
...Starting interlanguage link retrieval for 2011 Democratic Republic of the Congo coup d'état attempt
...Starting interlanguage link retrieval for 2012 Guinea-Bissau coup d'état
...Starting interlanguage link retrieval for 2012 Malian coup d'état
...Starting interlanguage link retrieval for 2013 Libyan coup d'état attempt
...Starting interlanguage link retrieval for 2014 Burkinabé uprising
There are no outlinks in uk version of "fr:Zéphirin Diabré"
There are no outlinks in ru version of "fr:Zéphirin Diabré"
...Starting interlanguage link retrieval for 2014 Lesotho political crisis
...Starting interlanguage link retrieval for 2014 Libyan coup d'état attempts
...Starting interlanguage link retrieval for 2014 Thai coup d'état
There are no outlinks in zh version of "th:กองอำนวยการรักษาความสงบเรียบร้อย"
There are no outlinks in zh version of "th:วินธัย สุวารี"
...Starting interlanguage link retrieval for 2015 Burundian coup d'état attempt
...Starting interlanguage link retrieval for Abkhazian Revolution
...Starting interlanguage link retrieval for 2015 Burkinabé coup d'état
There are no outlinks in es version of "fr:Coup d'État de 2015 au Burkina Faso"
...Starting interlanguage link retrieval for 2010 Ecuador crisis
...Starting interlanguage link retrieval for 2013 Egyptian coup d'état
...Starting interlanguage link retrieval for 2014 Gambian coup d'état attempt
...Starting interlanguage link retrieval for 2010 Nigerien coup d'état
There are no outlinks in ja version of "w:Albadé Abouba"
There are no outlinks in ja version of "w:Nigerien Alliance for Democracy and Progress"
There are no outlinks in ja version of "w:Ali Lamine Zeine"
Error on retrieving interlanguage link usage for "2010 Nigerien coup d'état"
...Starting interlanguage link retrieval for 2016 Turkish coup d'état attempt
There are no outlinks in it version of "q:Colpo di Stato in Turchia del 2016"
There are no outlinks in it version of "q:"
There are no outlinks in en version of "de:Rainer Hermann"

Scrape file image usage

 
# Collect image usage for every article in page_title_list and save the
# combined result to all_coups_file_image_usage.json.  A failure on one page
# is reported and skipped so the rest of the scrape can continue.
file_image_usage = {}

for page in page_title_list:
    try:
        image_usage = get_interlanguage_image_usage(page_title=page,bad_images=bad_images)
        file_image_usage[page] = image_usage

        # Optional per-page dump, kept disabled:
        #with open('imageusage-{0}.json'.format(page.replace(' ','_')),'w') as f:
        #    json.dump(image_usage,f)

    # `Exception` (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate on their own, and the message now includes the
    # cause so failures can be diagnosed from the cell output.
    except Exception as err:
        print("Error on {0}: {1!r}".format(page, err))

with open('all_coups_file_image_usage.json','w') as f:
    json.dump(file_image_usage,f)

Pull images

# Reload the image-usage data saved by the scraping step.
with open('all_coups_file_image_usage.json','r') as f:
    file_image_usage = json.load(f)

# Flatten article -> language -> filename into a list of (lang, filename)
# pairs, keeping only the first pair seen for each distinct filename.
# A set tracks the filenames already added, so each membership test is O(1)
# instead of rebuilding a list of every filename on each iteration.
full_image_list = list()
seen_filenames = set()
for article, article_payload in file_image_usage.items():
    for lang, file_payload in article_payload.items():
        for filename in file_payload:
            if filename not in seen_filenames:
                seen_filenames.add(filename)
                full_image_list.append((lang,filename))

print("There are {0} images in the full_image_list".format(len(full_image_list)))

full_image_list[:10]
There are 1537 images in the full_image_list
[('ja', 'ファイル:Ngo Dinh Diem - Thumbnail - ARC 542189.png'),
 ('ja', 'ファイル:Dinh doc lap bi doi bom.jpg'),
 ('ko', '파일:Diem dead.jpg'),
 ('ko', '파일:Ngo Dinh Diem - Thumbnail - ARC 542189.png'),
 ('id', 'Berkas:Ngo Dinh Diem - Thumbnail - ARC 542189.png'),
 ('en', 'File:Nhà thờ Cha Tam.jpg'),
 ('en', 'File:LBJ nhu.jpg'),
 ('en', 'File:Diem dead.jpg'),
 ('en', 'File:Nguyen Van Thieu with map (cropped).jpg'),
 ('en', 'File:Presidential Standard of South Vietnam (1955-1963).png')]
# Exploratory check: fetch one English Wikipedia file page and pull out the
# src of the first <img> whose alt text mentions "File".
file_page_url = 'https://en.wikipedia.org/wiki/File:Reform the Armed Forces Movement logo circa 1990s.png'
raw = requests.get(file_page_url, headers = header).text
soup = BeautifulSoup(raw,'lxml')

file_image_srcs = []
for img_tag in soup.find_all('img'):
    if 'File' in img_tag['alt']:
        file_image_srcs.append(img_tag['src'])

image_src = file_image_srcs[0]
image_src
# The download itself is left disabled here:
#urlretrieve(image_src, './Images/{0}'.format(defiled_filename))
'//upload.wikimedia.org/wikipedia/en/d/de/Reform_the_Armed_Forces_Movement_logo_circa_1990s.png'
# Exploratory check: fetch a Turkish Wikipedia file page, gather the src of
# every <img>, and test whether the first one equals the percent-encoded
# file name.
tr_file_name = "Chavez başarısız darbeden sonra göreve dönerken 13 Nisan 2002.jpg"
tr_page_url = "https://tr.wikipedia.org/wiki/Dosya:Chavez başarısız darbeden sonra göreve dönerken 13 Nisan 2002.jpg"
raw = requests.get(tr_page_url, headers = header).text
soup = BeautifulSoup(raw,'lxml')

image_src = []
for img_tag in soup.find_all('img'):
    image_src.append(img_tag['src'])

image_src[0] == quote(tr_file_name)
False
# Show the percent-encoded form of the file name (spaces become %20).
quote("Chavez başarısız darbeden sonra göreve dönerken 13 Nisan 2002.jpg")
'Chavez%20ba%C5%9Far%C4%B1s%C4%B1z%20darbeden%20sonra%20g%C3%B6reve%20d%C3%B6nerken%2013%20Nisan%202002.jpg'
# Display the current contents of image_src; the trailing comment keeps a
# disabled variant that would take the last path segment of the first entry.
image_src#[0].split('/')[-1]
['//upload.wikimedia.org/wikipedia/tr/3/36/Chavez_ba%C5%9Far%C4%B1s%C4%B1z_darbeden_sonra_g%C3%B6reve_d%C3%B6nerken_13_Nisan_2002.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/c/c3/NotCommons-emblem-copyrighted.svg/52px-NotCommons-emblem-copyrighted.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/3/34/Ambox_warning_blue.svg/50px-Ambox_warning_blue.svg.png',
 '//upload.wikimedia.org/wikipedia/tr/thumb/3/36/Chavez_ba%C5%9Far%C4%B1s%C4%B1z_darbeden_sonra_g%C3%B6reve_d%C3%B6nerken_13_Nisan_2002.jpg/120px-Chavez_ba%C5%9Far%C4%B1s%C4%B1z_darbeden_sonra_g%C3%B6reve_d%C3%B6nerken_13_Nisan_2002.jpg',
 '//tr.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1',
 '/static/images/wikimedia-button.png',
 '/static/images/poweredby_mediawiki_88x31.png']
# Count the distinct file names once the text before the first colon
# (the namespace prefix) is dropped.
bare_image_names = {entry[1].split(':')[1] for entry in full_image_list}
len(bare_image_names)
747
# Browser-like User-Agent sent with the page requests below.
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

# Files already on disk.  A set gives O(1) membership tests and is updated
# after each successful download, so a file listed under several
# language-specific prefixes is only fetched once.
current_images = set(os.listdir('./Images/'))


def _with_scheme(src):
    """Return *src* with an explicit https scheme if it is protocol-relative."""
    return 'https:' + src if src.startswith('//') else src


# (lang, filename) pairs that could not be downloaded from either source.
file_errors = list()
for (lang,filename) in full_image_list:
    # Drop the namespace prefix ("File:", "Dosya:", ...) to get the bare name.
    defiled_filename = filename.split(':',1)[1]
    if defiled_filename in current_images:
        continue

    target_path = './Images/{0}'.format(defiled_filename)

    # Try to get the image from Commons first
    try:
        # Remove file prefix because everything on Commons starts with File
        raw = requests.get('https://commons.wikimedia.org/wiki/File:' + defiled_filename, headers = header).text
        soup = BeautifulSoup(raw,'lxml')
        # .get() avoids a KeyError on <img> tags that lack alt/src; an empty
        # match list raises IndexError and sends us to the local edition.
        image_src = [i['src'] for i in soup.find_all('img')
                     if i.get('src') and 'File' in i.get('alt','')][0]
        urlretrieve(_with_scheme(image_src), target_path)
        current_images.add(defiled_filename)
        continue
    except Exception:
        # Not available on Commons (or the request failed): fall through to
        # the local Wikipedia edition.  KeyboardInterrupt is not an
        # Exception subclass, so it still propagates.
        pass

    # Get the file from the local Wikipedia edition
    try:
        raw = requests.get('https://{0}.wikipedia.org/wiki/{1}'.format(lang,filename), headers = header).text
        soup = BeautifulSoup(raw,'lxml')
        # Upload URLs use underscores for spaces and percent-encode other
        # characters, so compare against the decoded src.
        url_name = defiled_filename.replace(' ','_')
        local_srcs = [img['src'] for img in soup.find_all('img')
                      if url_name in unquote(img.get('src',''))]
        # IndexError here means no matching <img>; it is reported below
        # instead of being silently skipped.
        image_src = local_srcs[0]
        urlretrieve(_with_scheme(image_src), target_path)
        current_images.add(defiled_filename)
    except Exception:
        # Something is really broken
        print("Error on \"{0}\" in {1}".format(filename,lang))
        file_errors.append((lang,filename))
Error on "File:1992 Venezuelan coup d'état attempts Hugo Chávez speech.jpg" in en
Error on "Dosya:Pedro Carmona 2002 darbesinden sonra yemin töreninde.jpg" in tr
Error on "Dosya:Chavez başarısız darbeden sonra göreve dönerken 13 Nisan 2002.jpg" in tr
Error on "Dosya:Genesis FUBELT.jpg" in tr
Error on "Şəkil:Salvador Alyendenin son foto şəkili.jpg" in az
Error on "Dosya:Plevne Marşı pul.jpg" in tr
Error on "Dosya:MBKTürkeş.jpg" in tr
Error on "Dosya:27 Mayıs Darbesi sembolü.png" in tr
Error on "Dosya:General el Sisi Mursiyi görevden alındığını açıklarken.png" in tr
Error on "Файл:Казнь Касема.jpg" in ru
Error on "ملف:Taher yahya.jpg" in ar
Error on "ملف:Bakr.jpg" in ar
Error on "ملف:Mhdawi.jpg" in ar
Error on "ملف:Last day qasim.jpg" in ar
Error on "ملف:صورة ذبابات الإنقلابيين العسكريين التركيين.jpg" in ar
Error on "ملف:مذيعة قناة تي آر تي.jpg" in ar
Error on "ملف:صورة دبابات الإنقلابيين العسكريين التركيين.jpg" in ar
Error on "ملف:Trial iraq.jpg" in ar
Error on "Dosya:Mayıs 1981 Kenan Evren Ankaragücü kaptanı Adil Eriç'e Türkiye Kupası'nı veriyor.jpg" in tr
Error on "Dosya:Newsweek 22 Eylül 1980 kapak.jpg" in tr
Error on "Berkas:BP200906-2.jpg" in id
Error on "Berkas:The nation 200506.jpg" in id
Error on "चित्र:Amadou Toumani Touré 30 नवम्बर 2011.jpg" in hi
Error on "Berkas:Revolutie-strada-multime.jpg" in id
Error on "ملف:Watanyin.jpg" in ar
Error on "ملف:Defence.jpg" in ar
Error on "ملف:RafiqArif.jpg" in ar
Error on "ملف:July1.jpg" in ar
Error on "ملف:Qasim2.jpg" in ar
Error on "ملف:Sallama.png" in ar
Error on "ملف:Rafiq1.jpg" in ar
Error on "ملف:Qasim arif.jpg" in ar
Error on "ไฟล์:Burkinabé uprising at Place de la Nation.jpg" in th
Error on "Fájl:Operacio marverde.jpg" in hu
Error on "Файл:24 жніўня 1991.jpg" in be-x-old
Error on "Attēls:LRAP un MP 1991. gada 19. augusta paziņojums.jpg" in lv
Error on "Attēls:Konstitucionālais likums Par Latvijas Republikas valstisko statusu.jpg" in lv
Error on "Attēls:PSRS Valsts padomes lēmums par Latvijas Republikas neatkarības atzīšanu.jpg" in lv
Error on "Dosya:1993 Rusya anayasa krizi sırasında Parlamento yanlısı göstericiler.jpg" in tr
Error on "Dosya:4 Ekim 1993 Rusya Parlamento binası alevler içinde.jpg" in tr
Error on "ファイル:ベールイドーム.jpg" in ja
Error on "Ficheiro:Ülkü Ocakları Eğitim ve Kültür Vakfı Logosu.png" in pt
Error on "ملف:Hameed.png" in ar
Error on "Vaizdas:V. Landsbergis. Viešoji informacija.jpg" in lt

I copied the errors from the image download step into "image_download_errors.txt" so that these files can be downloaded manually. These errors occur because the images do not exist on Wikimedia Commons; they exist only on the local language edition.

# De-duplicate and sort the manually saved download-error lines.
# Line endings are stripped before comparison so that a final line without a
# trailing newline is not treated as distinct from its duplicate and is not
# fused onto the following line after sorting; blank lines are dropped.
with open('image_download_errors.txt','r',encoding='utf-8') as f:
    errors = sorted({line.rstrip('\r\n') for line in f if line.strip()})

with open('image_download_errors_cleaned.txt','w',encoding='utf-8') as f:
    f.writelines(line + '\n' for line in errors)

Scrape revisions

# Retrieve the revisions for every article in page_title_list, stack them
# into one table labelled by the originating article, and save it as CSV.
# A failure on one page is reported and skipped.
revisions_df_dict = {}

for page in page_title_list:
    try:
        # presumably a DataFrame of revisions (it is passed to pd.concat
        # below) -- confirm against get_interlanguage_revisions
        page_revisions = get_interlanguage_revisions(page_title=page)
        revisions_df_dict[page] = page_revisions

    # `Exception` (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate on their own; the cause is included in the output.
    except Exception as err:
        print("Error on {0}: {1!r}".format(page, err))

# The dict keys become an index level named 'parent_topic', which
# reset_index(level=0) turns into an ordinary column.
all_revisions_df = pd.concat(revisions_df_dict.values(),keys=revisions_df_dict.keys(),
                             names=['parent_topic']).reset_index(level=0)

all_revisions_df.to_csv('all_coups_revisions.csv',encoding='utf8',index=False)
Error getting revisions in fa version of "کودتای ۲۸ مرداد"
Error getting revisions in tr version of "1958 Irak Darbesi"
Error getting revisions in ar version of "حركة تموز 1958"
Error getting revisions in tr version of "27 Mayıs Darbesi"
Error getting revisions in pt version of "Golpe de Estado no Brasil em 1964"
Error getting revisions in ko version of "5·16 군사 정변"
Error getting revisions in vi version of "Đảo chính Việt Nam Cộng hòa 1963"
Error getting revisions in de version of "Militärputsch in Südvietnam 1960"
Error getting revisions in pt version of "Revolução de 25 de Abril de 1974"
Error getting revisions in fr version of "Coup d'État du 11 septembre 1973 au Chili"
Error getting revisions in es version of "Golpe de Estado en Chile de 1973"
Error getting revisions in tr version of "12 Eylül Darbesi"
Error getting revisions in ro version of "Revoluția Română din 1989"
Error getting revisions in de version of "Rumänische Revolution 1989"
Error getting revisions in ru version of "События сентября — октября 1993 года в Москве"
Error getting revisions in ru version of "Августовский путч"
Error getting revisions in th version of "รัฐประหารในประเทศไทย พ.ศ. 2549"
Error getting revisions in es version of "Golpe de Estado en Tailandia de 2006"
Error getting revisions in es version of "Golpe de Estado en Honduras de 2009"
Error getting revisions in fr version of "Coup d'État de 2009 au Honduras"
Error getting revisions in es version of "Golpe de Estado en Mauritania en 2008"
Error getting revisions in es version of "Golpe de Estado en Venezuela de 2002"
Error getting revisions in th version of "รัฐประหารในประเทศไทย พ.ศ. 2557"
Error getting revisions in es version of "30S (Ecuador)"
Error getting revisions in de version of "Militärputsch in Ägypten 2013"
Error getting revisions in tr version of "2016 Türkiye askerî darbe girişimi"
Error getting revisions in de version of "Putschversuch in der Türkei 2016"
Error getting revisions in fr version of "Tentative de coup d'État de 2016 en Turquie"
Error getting revisions in ru version of "Попытка военного переворота в Турции (2016)"
# Retrieve external-link usage for every article in page_title_list and save
# the combined result to all_coups_extlinks_usage.json.  A failure on one
# page is reported and skipped.
external_links_usage = {}

for page in page_title_list:
    try:
        extlink_usage = get_interlanguage_extlinks_usage(page_title=page)
        external_links_usage[page] = extlink_usage

    # `Exception` (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate on their own; the cause is included in the output.
    except Exception as err:
        print("Error on {0}: {1!r}".format(page, err))

with open('all_coups_extlinks_usage.json','w') as f:
    json.dump(external_links_usage,f)
Error getting extlinks in es version of "Anexo:Cronología del golpe de Estado de 1953 (Colombia)"
Error getting extlinks in ar version of "انقلاب كولومبيا 1953"
Error getting extlinks in eu version of "1954ko Guatemalako estatu kolpea"
Error getting extlinks in nl version of "Operatie PBSUCCESS"
Error getting extlinks in eo version of "Operaco PBSUCCESS"
Error getting extlinks in ar version of "انقلاب غواتيمالا 1954"
Error getting extlinks in sv version of "Operation Ajax"
Error getting extlinks in bg version of "Държавен преврат в Иран (1953)"
Error getting extlinks in az version of "Ayaks əməliyyatı"
Error getting extlinks in fa version of "کودتای ضدسلطنتی عراق"
Error getting extlinks in pl version of "Zamieszki w Mosulu (1959)"
Error getting extlinks in ar version of "ثورة الشواف"
Error getting extlinks in pt version of "Crise de Maio de 1958"
Error getting extlinks in de version of "Putsch d’Alger (1958)"
Error getting extlinks in ur version of "راولپنڈی سازش"
Error getting extlinks in ru version of "Заговор в Равалпинди"
Error getting extlinks in en version of "1954 Syrian coup d'état"
Error getting extlinks in ru version of "Государственный переворот в Сирии (1954)"
Error getting extlinks in ar version of "الانقلاب العسكري في سوريا 1954"
Error getting extlinks in en version of "Silent Coup (Thailand)"
Error getting extlinks in en version of "1958 Venezuelan coup d'état"
Error getting extlinks in es version of "Golpe de Estado en Venezuela de 1958"
Error getting extlinks in fi version of "Turkin vallankaappaus 1960"
Error getting extlinks in pl version of "Zamach stanu w Syrii (1963)"
Error getting extlinks in fr version of "Coup d'État de 1963 en Syrie"
Error getting extlinks in pt version of "Revolução de 8 de Março"
Error getting extlinks in de version of "Revolution des 8. März"
Error getting extlinks in ar version of "الانقلاب العسكري في توغو 1963"
Error getting extlinks in en version of "1966 Republic of the Congo coup d'état attempt"
Error getting extlinks in pl version of "Zamach stanu w Syrii (1966)"
Error getting extlinks in ar version of "الانقلاب العسكري في سوريا 1966"
Error getting extlinks in ja version of "1966年シリアクーデター"
Error getting extlinks in zh version of "利比亚绿色革命"
Error getting extlinks in sv version of "Generalkuppen i Alger"
Error getting extlinks in cy version of "Putsch y Cadfridogion"
Error getting extlinks in gl version of "Revolución Arxentina"
Error getting extlinks in eo version of "Argentina Revolucio"
Error getting extlinks in he version of "הפיכת 1964 (ברזיל)"
Error getting extlinks in sh version of "Državni udar u Brazilu 1964."
Error getting extlinks in fr version of "Coup d'État de la Saint-Sylvestre"
Error getting extlinks in pl version of "Zamach stanu w Etiopii (1960)"
Error getting extlinks in pt version of "Golpe de Estado na Etiópia em 1960"
Error getting extlinks in ru version of "Государственный переворот в Габоне (1964)"
Error getting extlinks in uk version of "Державний переворот у Габоні (1964)"
Error getting extlinks in ar version of "ثورة 17 تموز 1968"
Error getting extlinks in pl version of "Zamach stanu w Iraku (1968)"
Error getting extlinks in de version of "Militärputsch vom 18. November 1963"
Error getting extlinks in ja version of "1963年11月イラククーデター"
Error getting extlinks in no version of "8. februar-revolusjonen"
Error getting extlinks in pl version of "Zamach stanu w Iraku (luty 1963)"
Error getting extlinks in ja version of "ラマダーン革命"
Error getting extlinks in br version of "Taol-Stad ar 16 a viz Mae"
Error getting extlinks in fr version of "Coup d'État du 16 mai"
Error getting extlinks in th version of "รัฐประหาร 16 พฤษภาคม"
Error getting extlinks in pt version of "Golpe de Estado de 16 de Maio"
Error getting extlinks in es version of "Golpe de Estado del 16 de mayo"
Error getting extlinks in en version of "1966 Nigerian counter-coup"
Error getting extlinks in pt version of "Contragolpe na Nigéria em 1966"
Error getting extlinks in ca version of "Cop d'estat a Nigèria de juliol de 1966"
Error getting extlinks in ko version of "1963년 남베트남 군사 쿠데타"
Error getting extlinks in cy version of "Coup d'état De Fietnam, 1963"
Error getting extlinks in vi version of "Cuộc chỉnh lý tại Việt Nam Cộng hòa 1964"
Error getting extlinks in de version of "Militärputsch in Südvietnam 1960"
Error getting extlinks in pt version of "Golpe de Estado no Vietnã do Sul em 1960"
Error getting extlinks in tr version of "1960 Güney Vietnam darbe girişimi"
Error getting extlinks in ca version of "Cop d'Estat de Vietnam del Sud de 1960"
Error getting extlinks in es version of "Golpe de Estado en Vietnam del Sur de 1960"
Error getting extlinks in en version of "1966 Upper Voltan coup d'état"
Error getting extlinks in pt version of "Golpe de Estado em Alto Volta em 1966"
Error getting extlinks in uk version of "Переворот у Республіці Верхня Вольта (1966)"
Error getting extlinks in en version of "1972 Republic of the Congo coup d'état attempt"
Error getting extlinks in nl version of "Saur-Revolutie"
Error getting extlinks in lv version of "Aprīļa revolūcija Afganistānā"
Error getting extlinks in de version of "Saurrevolution"
Error getting extlinks in no version of "Saur-revolusjonen"
Error getting extlinks in ko version of "1976년 아르헨티나 쿠데타"
Error getting extlinks in wa version of "Revintreye ås djalofrenes"
Error getting extlinks in ko version of "카네이션 혁명"
Error getting extlinks in sr version of "Каранфилска револуција"
Error getting extlinks in ar version of "ثورة القرنفل"
Error getting extlinks in nn version of "Nellikrevolusjonen"
Error getting extlinks in oc version of "Revolucion dels ulhets"
Error getting extlinks in uk version of "Революція гвоздик"
Error getting extlinks in no version of "Nellikrevolusjonen"
Error getting extlinks in bg version of "Революция на карамфилите"
Error getting extlinks in he version of "מהפכת הציפורנים"
Error getting extlinks in hr version of "Revolucija karanfila"
Error getting extlinks in da version of "Nellikerevolutionen"
Error getting extlinks in tet version of "Revolusaun Kravu nian"
Error getting extlinks in is version of "Nellikubyltingin"
Error getting extlinks in lv version of "Neļķu revolūcija"
Error getting extlinks in br version of "Dispac'h ar Jenofl"
Error getting extlinks in lmo version of "Rivoluzzion di Garofoi"
Error getting extlinks in en version of "1973 Chilean coup d'état"
Error getting extlinks in vi version of "Đảo chính năm 1973 tại Chile"
Error getting extlinks in br version of "Taol-stad Gwengolo 1973 e Chile"
Error getting extlinks in id version of "Kudeta Chili 1973"
Error getting extlinks in pl version of "Zamach stanu na Cyprze"
Error getting extlinks in en version of "Ecclesiastical coup"
Error getting extlinks in pl version of "Kryzys kościelny (Cypr)"
Error getting extlinks in it version of "Operazione mare verde"
Error getting extlinks in fi version of "Vihreän meren operaatio"
Error getting extlinks in en version of "1975 Honduran coup d'état"
Error getting extlinks in en version of "1978 Honduran coup d'état"
Error getting extlinks in ru version of "Военный переворот в Пакистане (1977)"
Error getting extlinks in hr version of "Državni udar u Urugvaju 1973."
Error getting extlinks in es version of "El golpe de los sargentos"
Error getting extlinks in ka version of "თურქეთის სახელმწიფო გადატრიალება (1980)"
Error getting extlinks in az version of "12 sentyabr hərbi çevrilişi"
Error getting extlinks in en version of "1981 Central African Republic coup d'état"
Error getting extlinks in en version of "1987 Republic of the Congo coup d'état attempt"
Error getting extlinks in en version of "1980 Bolivian coup d'état"
Error getting extlinks in zh version of "5·17緊急戒嚴"
Error getting extlinks in ja version of "5・17非常戒厳令拡大措置"
Error getting extlinks in th version of "รัฐประหารในฟีจี พ.ศ. 2530"
Error getting extlinks in si version of "1988 මාලදිවයින් රාජ්‍ය විරෝධී කුමන්ත්‍රණය"
Error getting extlinks in mr version of "ऑपरेशन कॅक्टस"
Error getting extlinks in da version of "EDSA Revolutionen"
Error getting extlinks in ko version of "피플 파워 혁명"
Error getting extlinks in br version of "Dispac'h an EDSA"
Error getting extlinks in war version of "Rebolusyon EDSA hadton 1986"
Error getting extlinks in id version of "Revolusi EDSA"
Error getting extlinks in ilo version of "Rebolusion ti Bileg ti Tattao"
Error getting extlinks in ceb version of "Rebolusyong EDSA sa 1986"
Error getting extlinks in en version of "1986–90 Philippine coup attempts"
Error getting extlinks in hr version of "Rumunjska revolucija 1989."
Error getting extlinks in cs version of "Rumunská revoluce"
Error getting extlinks in sc version of "Rebolussione rumena"
Error getting extlinks in simple version of "Romanian Revolution of 1989"
Error getting extlinks in pt version of "Revolução Romena de 1989"
Error getting extlinks in sl version of "Romunska revolucija 1989"
Error getting extlinks in ga version of "Réabhlóid na Rómáine 1989"
Error getting extlinks in eo version of "Rumana Revolucio de 1989"
Error getting extlinks in tl version of "Himagsikang Rumano ng 1989"
Error getting extlinks in sr version of "Румунска револуција 1989."
Error getting extlinks in en version of "1982 Spanish coup d'état attempt"
Error getting extlinks in ca version of "Conspiració colpista per al 27 d'octubre de 1982"
Error getting extlinks in en version of "1980 Upper Voltan coup d'état"
Error getting extlinks in pt version of "Golpe de Estado em Alto Volta em 1980"
Error getting extlinks in pt version of "Golpe de Estado em Alto Volta em 1982"
Error getting extlinks in en version of "1983 Upper Voltan coup d'état attempt"
Error getting extlinks in pt version of "Tentativa de golpe de Estado em Alto Volta em 1983"
Error getting extlinks in fr version of "Coup d'État de 1991 au Mali"
Error getting extlinks in en version of "1991 Thai coup d'état"
Error getting extlinks in ru version of "Переворот добровольцев"
Error getting extlinks in en version of "1993 Azeri coup d'état"
Error getting extlinks in en version of "1993 Guatemalan constitutional crisis"
Error getting extlinks in pt version of "Crise constitucional na Guatemala em 1993"
Error getting extlinks in ms version of "Rampasan kuasa tentera Pakistan 1999"
Error getting extlinks in ur version of "پاکستان میں فوجی تاخت 1999ء"
Error getting extlinks in ja version of "アウトゴルペ"
Error getting extlinks in ko version of "1993년 러시아 체제 위기"
Error getting extlinks in fi version of "Venäjän perustuslaillinen kriisi 1993"
Error getting extlinks in ka version of "რუსეთის 1993 წლის კონსტიტუციური კრიზისი"
Error getting extlinks in bg version of "Руска конституционна криза (1992 – 1993)"
Error getting extlinks in th version of "วิกฤตการณ์รัฐธรรมนูญรัสเซีย พ.ศ. 2536"
Error getting extlinks in nl version of "Russische constitutionele crisis van 1993"
Error getting extlinks in ja version of "10月政変"
Error getting extlinks in mn version of "Зөвлөлтийн 1991 оны төрийн эргэлт хийх оролдлого"
Error getting extlinks in sr version of "Августовски пуч"
Error getting extlinks in ky version of "Август кризиси 1991"
Error getting extlinks in zh-yue version of "八月政變"
Error getting extlinks in zh version of "八一九事件"
Error getting extlinks in da version of "Augustkuppet"
Error getting extlinks in kk version of "Тамыз бүлігі"
Error getting extlinks in lmo version of "Putsch de Agost"
Error getting extlinks in es version of "Intento de golpe de Estado en la Unión Soviética"
Error getting extlinks in fr version of "Tentative de coup d'État de 1992 au Venezuela"
Error getting extlinks in ca version of "Cop d'estat de febrer de 1992 a Veneçuela"
Error getting extlinks in fi version of "Haitin kapina 2004"
Error getting extlinks in et version of "2004. aasta Haiti mäss"
Error getting extlinks in no version of "Statskuppet i Mauritania 2005"
Error getting extlinks in pl version of "Zamach stanu na Fidżi (2006)"
Error getting extlinks in fi version of "Fidžin vallankaappaus 2006"
Error getting extlinks in ms version of "Rampasan kuasa di Thailand 2006"
Error getting extlinks in th version of "รัฐประหารในประเทศไทย พ.ศ. 2549"
Error getting extlinks in tl version of "Kudeta sa Thailand (2006)"
Error getting extlinks in es version of "Golpe de Estado en Tailandia de 2006"
Error getting extlinks in br version of "EDSA II"
Error getting extlinks in zh version of "人民力量革命 (2001年)"
Error getting extlinks in ru version of "Вторая народная революция на Филиппинах"
Error getting extlinks in nl version of "EDSA-revolutie II"
Error getting extlinks in no version of "Kuppforsøket i Ekvatorial-Guinea i 2004"
Error getting extlinks in el version of "Πολιτική κρίση της Ονδούρας (2009)"
Error getting extlinks in zh version of "2009年宏都拉斯軍事政變"
Error getting extlinks in es version of "Golpe de Estado en Honduras de 2009"
Error getting extlinks in sv version of "Tulpanrevolutionen"
Error getting extlinks in ko version of "튤립 혁명"
Error getting extlinks in ar version of "ثورة التوليب"
Error getting extlinks in uk version of "Тюльпанова революція"
Error getting extlinks in kk version of "Қызғалдақ төңкерісі"
Error getting extlinks in nl version of "Tulpenrevolutie"
Error getting extlinks in eo version of "Tulipa revolucio"
Error getting extlinks in ar version of "انقلاب 2008 في موريتانيا"
Error getting extlinks in es version of "Golpe de Estado en Venezuela de 2002"
Error getting extlinks in hi version of "मालियन राज्यविप्लव २०१२"
Error getting extlinks in si version of "2014 තායි රාජ්‍ය විරෝධි කුමන්ත්‍රණය"
Error getting extlinks in sco version of "2014 Thai coup d'état"
Error getting extlinks in fr version of "Révolution abkhaze de 2014"
Error getting extlinks in en version of "2010 Ecuador crisis"
Error getting extlinks in el version of "Αιγυπτιακό πραξικόπημα 2013"
Error getting extlinks in ko version of "2013년 이집트 쿠데타"
Error getting extlinks in az version of "Misirdə hərbi çeviriliş (2013)"
Error getting extlinks in uk version of "Військовий переворот у Нігері (2010)"
Error getting extlinks in es version of "Golpe de Estado en Níger de 2010"
Error getting extlinks in ja version of "ニジェール軍事クーデター (2010年)"
Error getting extlinks in ka version of "თურქეთის სახელმწიფო გადატრიალების მცდელობა (2016)"
Error getting extlinks in diq version of "Tırkiya de teşebusê darbeyê eskeri 2016"
Error getting extlinks in lmo version of "Tentativ de Colp de Stat in Turchia del 2016"
Error getting extlinks in ml version of "തുർക്കിസൈനിക അട്ടിമറിശ്രമം"
Error getting extlinks in tr version of "2016 Türkiye askerî darbe girişimi"
Error getting extlinks in en version of "2016 Turkish coup d'état attempt"

Make graphs

# Directed graph with one edge per (outer key, inner key) pair of
# global_image_dict; the edge weight is the length of the innermost list.
# NOTE(review): outer keys look like language codes and inner keys like image
# file names, judging by how the mapping is used elsewhere -- confirm.
g = nx.DiGraph()
g.add_weighted_edges_from(
    (
        (language, image_file, len(pages))
        for language, files_by_name in global_image_dict.items()
        for image_file, pages in files_by_name.items()
    ),
    weight='weight',
)

# Export for use in external graph tools.
nx.write_gexf(g,'global_images.gexf')
# Display the mapping of language code -> Counter of external-link domains.
interlanguage_extlinks
{'af': Counter({'al-monitor.com': 1, 'gva.be': 1, 'npo.nl': 1}),
 'ar': Counter({'ahram-canada.com': 1,
          'ahram.org.eg': 9,
          'al-monitor.com': 1,
          'alarabiya.net': 2,
          'albayan.ae': 1,
          'alhayat.com': 1,
          'aljazeera.net': 8,
          'almadenahnews.com': 1,
          'almasryalyoum.com': 2,
          'alriyadh.com': 1,
          'alwafd.org': 2,
          'anbamoscow.com': 2,
          'annahar.com': 1,
          'assabeel.net': 1,
          'bbc.co.uk': 8,
          'carthage.tn': 1,
          'cnn.com': 3,
          'dailynewsegypt.com': 1,
          'dw.de': 1,
          'echoroukonline.com': 1,
          'egynews.net': 1,
          'egyptindependent.com': 1,
          'elaph.com': 1,
          'elbadil.com': 2,
          'elwatannews.com': 2,
          'euronews.com': 1,
          'facebook.com': 1,
          'france24.com': 2,
          'guardian.co.uk': 2,
          'hurriyetdailynews.com': 1,
          'marebpress.net': 1,
          'masrawy.com': 3,
          'nbcnews.com': 1,
          'newsmax.com': 1,
          'radiosawa.com': 1,
          'reuters.com': 9,
          'riy.cc': 1,
          'rt.com': 1,
          'shorouknews.com': 3,
          'skynewsarabia.com': 1,
          'usatoday.com': 1,
          'youm7.com': 1,
          'youtube.com': 3}),
 'arz': Counter({'alarabiya.net': 1,
          'archive.is': 1,
          'bbc.co.uk': 3,
          'cbsnews.com': 1,
          'dailynewsegypt.com': 1,
          'edmontonjournal.com': 1,
          'egyptindependent.com': 1,
          'focac.org': 1,
          'foxnews.com': 1,
          'ft.com': 1,
          'guardiannews.com': 1,
          'humanevents.com': 1,
          'mondediplo.com': 1,
          'nytimes.com': 2,
          'reuters.com': 1,
          'thestar.com': 1,
          'time.com': 1,
          'washingtonpost.com': 1,
          'wordpress.com': 1,
          'wsj.com': 1}),
 'bg': Counter({'aljazeera.com': 1,
          'bbc.com': 1,
          'cnn.com': 2,
          'nytimes.com': 1,
          'wikipedia.org': 2}),
 'ca': Counter({'20minutos.es': 1,
          'abc.es': 3,
          'ahram.org.eg': 1,
          'bbc.com': 1,
          'elmundo.es': 1,
          'go.com': 1,
          'guardian.co.uk': 1,
          'latimes.com': 1,
          'reuters.com': 1,
          'sky.com': 1,
          'theguardian.com': 1,
          'wikimedia.org': 1}),
 'ckb': Counter({'ahram.org.eg': 1,
          'aljazeera.com': 2,
          'dailymail.co.uk': 1,
          'rudaw.net': 2,
          'xendan.org': 2}),
 'de': Counter({'20min.ch': 2,
          'ahram.org.eg': 2,
          'akhbar-alkhaleej.com': 1,
          'aljazeera.com': 1,
          'alsharq.de': 1,
          'alwehda.gov.sy': 1,
          'assabah.com.tn': 1,
          'auswaertiges-amt.de': 2,
          'bbc.co.uk': 6,
          'brookings.edu': 1,
          'carnegieendowment.org': 2,
          'cihrs.org': 1,
          'dailymotion.com': 2,
          'dailynewsegypt.com': 3,
          'dailystar.com.lb': 1,
          'daserste.de': 1,
          'defense.gov': 1,
          'derstandard.at': 3,
          'diepresse.com': 1,
          'dradio.de': 1,
          'dw.de': 11,
          'elwatan.com': 1,
          'etemaad.ir': 1,
          'euronews.com': 1,
          'farsnews.com': 1,
          'faz.net': 12,
          'focus.de': 6,
          'gulfnews.com': 1,
          'haaretz.com': 2,
          'hamshahrionline.ir': 1,
          'handelsblatt.com': 3,
          'heise.de': 1,
          'hrw.org': 1,
          'huffingtonpost.com': 2,
          'independent.co.uk': 2,
          'isna.ir': 1,
          'israelhayom.com': 1,
          'kurier.at': 2,
          'lefigaro.fr': 1,
          'lorientlejour.com': 1,
          'n-tv.de': 6,
          'n24.de': 2,
          'ndr.de': 1,
          'neues-deutschland.de': 1,
          'nytimes.com': 4,
          'nzz.ch': 2,
          'ouest-france.fr': 1,
          'ovb-online.de': 1,
          'phoenix.de': 1,
          'profil.at': 1,
          'reuters.com': 11,
          'rferl.org': 1,
          'rp-online.de': 2,
          'sis.gov.eg': 1,
          'spiegel.de': 27,
          'stern.de': 3,
          'sueddeutsche.de': 17,
          'swp-berlin.org': 2,
          'tagesschau.de': 4,
          'tagesspiegel.de': 7,
          'taz.de': 6,
          'tehrantimes.com': 1,
          'theatlantic.com': 1,
          'thedailybeast.com': 2,
          'theeuropean.de': 1,
          'theguardian.com': 6,
          'timesofisrael.com': 2,
          'unbubble.eu': 1,
          'voanews.com': 7,
          'washingtonpost.com': 3,
          'webcitation.org': 172,
          'welt.de': 8,
          'whitehouse.gov': 1,
          'wikimedia.org': 1,
          'wissenschaft-und-frieden.de': 1,
          'wiwo.de': 1,
          'worldbulletin.net': 1,
          'yahoo.com': 2,
          'youtube.com': 6,
          'zeit.de': 20}),
 'en': Counter({'academia.edu': 1,
          'ahram.org.eg': 17,
          'al-monitor.com': 4,
          'alarabiya.net': 3,
          'aljazeera.com': 33,
          'allafrica.com': 1,
          'amazon.com': 1,
          'amnesty.org.uk': 2,
          'ap.org': 3,
          'archive.is': 2,
          'bbc.co.uk': 14,
          'bbc.com': 1,
          'bloomberg.com': 2,
          'buenosairesherald.com': 1,
          'canada.com': 2,
          'cancilleria.gov.co': 1,
          'carnegieendowment.org': 1,
          'cbc.ca': 1,
          'cbsnews.com': 2,
          'chron.com': 1,
          'cnn.com': 5,
          'cpj.org': 2,
          'csmonitor.com': 1,
          'dailykos.com': 1,
          'dailynewsegypt.com': 4,
          'doi.org': 4,
          'dw.de': 1,
          'economist.com': 3,
          'edmontonjournal.com': 2,
          'egyptindependent.com': 7,
          'everything-pr.com': 2,
          'facebook.com': 1,
          'focac.org': 1,
          'foreignaffairs.com': 1,
          'foxnews.com': 3,
          'france24.com': 2,
          'freep.com': 2,
          'ft.com': 1,
          'gallup.com': 1,
          'google.com': 1,
          'harakahdaily.net': 4,
          'hotair.com': 1,
          'house.gov': 1,
          'hrw.org': 2,
          'huffingtonpost.com': 9,
          'humanevents.com': 1,
          'hurriyetdailynews.com': 3,
          'independent.co.uk': 2,
          'interaksyon.com': 1,
          'israelnationalnews.com': 1,
          'itv.com': 1,
          'jpost.com': 2,
          'khabaronline.ir': 1,
          'middleeasteye.net': 1,
          'middleeastmonitor.com': 2,
          'nbcnews.com': 2,
          'newsmax.com': 2,
          'npr.org': 2,
          'nytimes.com': 7,
          'ohchr.org': 1,
          'oilprice.com': 1,
          'pewglobal.org': 1,
          'presstv.com': 6,
          'reuters.com': 13,
          'sky.com': 1,
          'smh.com.au': 2,
          'straitstimes.com': 2,
          'sudantribune.com': 1,
          'talkingpointsmemo.com': 1,
          'telegraph.co.uk': 4,
          'theatlanticwire.com': 2,
          'theaustralian.com.au': 2,
          'thedailybeast.com': 1,
          'theguardian.com': 5,
          'thenational.ae': 1,
          'thenews.com.pk': 1,
          'thestar.com': 2,
          'time.com': 2,
          'timesofisrael.com': 1,
          'tripolipost.com': 1,
          'twitter.com': 1,
          'upi.com': 1,
          'usat.ly': 1,
          'usatoday.com': 1,
          'voanews.com': 3,
          'washingtonpost.com': 2,
          'washingtontimes.com': 1,
          'wdbj7.com': 2,
          'wdsu.com': 2,
          'webcitation.org': 2,
          'wordpress.com': 2,
          'worldcat.org': 1,
          'wsj.com': 2,
          'yahoo.com': 8,
          'youtube.com': 2}),
 'es': Counter({'20minutos.es': 1,
          'abc.es': 1,
          'al-akhbar.com': 1,
          'alarabiya.net': 1,
          'aljazeera.com': 1,
          'amnesty.org': 2,
          'archive.is': 1,
          'bbc.co.uk': 6,
          'clarin.com': 1,
          'cnn.com': 1,
          'egyptindependent.com': 1,
          'eldinamo.cl': 1,
          'elmundo.es': 5,
          'elpais.com': 4,
          'go.com': 1,
          'guardian.co.uk': 4,
          'huffingtonpost.com': 1,
          'latercera.com': 1,
          'latimes.com': 1,
          'libyaherald.com': 3,
          'reuters.com': 2,
          'rtve.es': 1,
          'sky.com': 1,
          'telam.com.ar': 1,
          'telegraph.co.uk': 1,
          'telemadrid.es': 1,
          'theaustralian.com.au': 1,
          'theguardian.com': 1,
          'thestar.com': 1,
          'whitehouse.gov': 1}),
 'fa': Counter({'alhayat.com': 1,
          'alwehda.gov.sy': 1,
          'dw.de': 3,
          'egyptindependent.com': 1,
          'euronews.com': 3,
          'guardian.co.uk': 1,
          'irlister.com': 1,
          'isna.ir': 1,
          'neworientnews.com': 1,
          'presstv.com': 1,
          'tabnak.ir': 2,
          'timesofisrael.com': 1,
          'worldbulletin.net': 1}),
 'fi': Counter({'aamulehti.fi': 3,
          'alarabiya.net': 4,
          'aljazeera.com': 3,
          'bbc.co.uk': 3,
          'cnn.com': 1,
          'hs.fi': 15,
          'iltalehti.fi': 6,
          'iltasanomat.fi': 12,
          'kainuunsanomat.fi': 1,
          'l%C3%A4nsi-savo.fi': 1,
          'mtv.fi': 6,
          'mtv3.fi': 15,
          'ruvr.ru': 1,
          'satakunnankansa.fi': 1,
          'savonsanomat.fi': 1,
          'suomenkuvalehti.fi': 2,
          'talouselama.fi': 2,
          'taloussanomat.fi': 1,
          'yle.fi': 31}),
 'fr': Counter({'ahram.org.eg': 7,
          'alarabiya.net': 2,
          'allafrica.com': 1,
          'amnesty.ch': 4,
          'bbc.co.uk': 3,
          'depechesdugabon.com': 1,
          'dhnet.be': 1,
          'diplo.de': 1,
          'edmontonjournal.com': 1,
          'egyptindependent.com': 2,
          'euronews.com': 2,
          'europe1.fr': 1,
          'fjponline.com': 1,
          'guardian.co.uk': 1,
          'hotair.com': 1,
          'hrw.org': 2,
          'irib.ir': 1,
          'irishtimes.com': 1,
          'jn1.tv': 1,
          'ladepeche.fr': 1,
          'lefigaro.fr': 4,
          'lejdd.fr': 1,
          'lemonde.fr': 6,
          'leparisien.fr': 1,
          'lesechos.fr': 2,
          'levif.be': 1,
          'lexpress.fr': 1,
          'liberation.fr': 1,
          'mediapart.fr': 1,
          'mei.edu': 1,
          'monde-diplomatique.fr': 1,
          'mondediplo.net': 4,
          'ouest-france.fr': 1,
          'reuters.com': 1,
          'rian.ru': 1,
          'rtbf.be': 1,
          'rtl.fr': 2,
          'telegraph.co.uk': 1,
          'thestar.com': 1,
          'time.com': 1,
          'timesofisrael.com': 1,
          'yahoo.com': 2}),
 'he': Counter({'arabicmedia.co.il': 4,
          'haaretz.co.il': 2,
          'mako.co.il': 1,
          'news1.co.il': 1,
          'nrg.co.il': 5,
          'themarker.com': 1,
          'walla.co.il': 4,
          'ynet.co.il': 8}),
 'hi': Counter({'bbc.co.uk': 2,
          'cnn.com': 1,
          'india.com': 1,
          'indiatimes.com': 1,
          'livehindustan.com': 1,
          'patrika.com': 1}),
 'id': Counter({'bangkokpost.com': 1,
          'bbc.co.uk': 4,
          'cnn.com': 1,
          'dawn.com': 1,
          'guardian.co.uk': 2,
          'telegraph.co.uk': 1,
          'thestar.com': 2,
          'timesofoman.com': 1}),
 'it': Counter({'adnkronos.com': 1,
          'agi.it': 1,
          'ansa.it': 2,
          'arabpress.eu': 2,
          'archive.is': 35,
          'avvenire.it': 1,
          'bbc.co.uk': 1,
          'corriere.it': 4,
          'diocesi.torino.it': 2,
          'fanpage.it': 3,
          'hdn.com.tr': 1,
          'huffingtonpost.it': 1,
          'hurriyetdailynews.com': 1,
          'ilfattoquotidiano.it': 2,
          'ilmanifesto.it': 7,
          'ilmondo.it': 2,
          'ilpost.it': 1,
          'ilsecoloxix.it': 1,
          'internazionale.it': 2,
          'jpost.com': 1,
          'lastampa.it': 1,
          'lettera43.it': 2,
          'panorama.it': 1,
          'repubblica.it': 5,
          'reuters.com': 1,
          'swissinfo.ch': 2,
          'tmnews.it': 1,
          'today.it': 2,
          'wikimedia.org': 2}),
 'ja': Counter({'47news.jp': 3,
          'afpbb.com': 19,
          'ahram.org.eg': 30,
          'alarabiya.net': 3,
          'aljazeera.com': 3,
          'allafrica.com': 1,
          'almasryalyoum.com': 2,
          'archive.is': 7,
          'asahi.com': 9,
          'assabeel.net': 1,
          'aswatmasriya.com': 1,
          'bbc.co.uk': 1,
          'bloomberg.com': 1,
          'businessmirror.com.ph': 1,
          'cbe.org.eg': 7,
          'christiantoday.co.jp': 2,
          'chugoku-np.co.jp': 1,
          'cnn.co.jp': 7,
          'dailynewsegypt.com': 10,
          'diplo.jp': 1,
          'egyptindependent.com': 2,
          'emb-japan.go.jp': 1,
          'getnews.jp': 1,
          'hokkaido-np.co.jp': 1,
          'huffingtonpost.jp': 1,
          'ikhwanweb.com': 1,
          'irib.ir': 2,
          'ismedia.jp': 1,
          'jcp.or.jp': 7,
          'jetro.go.jp': 1,
          'jiji.com': 12,
          'kuna.net.kw': 1,
          'maannews.net': 3,
          'mainichi.jp': 6,
          'mofa.go.jp': 1,
          'msn.com': 11,
          'newsweekjapan.jp': 3,
          'nhk.or.jp': 8,
          'nicovideo.jp': 1,
          'nikkei.com': 13,
          'nikkeibp.co.jp': 1,
          'nippon.com': 1,
          'nuqudy.com': 1,
          'pjin.jp': 1,
          'presstv.com': 1,
          'presstv.ir': 19,
          'reuters.com': 19,
          'ruvr.ru': 2,
          'the-liberty.com': 1,
          'thehindu.com': 1,
          'thenational.ae': 1,
          'timesofisrael.com': 1,
          'tkfd.or.jp': 1,
          'tokyo-np.co.jp': 3,
          'tradingeconomics.com': 2,
          'tufs.ac.jp': 2,
          'worldbulletin.net': 9,
          'worldtimes.co.jp': 3,
          'wsj.com': 2,
          'yomiuri.co.jp': 3,
          'youtube.com': 2,
          'zaikei.co.jp': 1,
          'zakzak.co.jp': 1}),
 'nl': Counter({'al-monitor.com': 1,
          'gva.be': 1,
          'npo.nl': 1,
          'nrc.nl': 1,
          'wikimedia.org': 1}),
 'pl': Counter({'bbc.co.uk': 1,
          'businessinsider.com': 1,
          'mscichowscy.pl': 1,
          'nbcnews.com': 1,
          'tvn24.pl': 2,
          'tvp.info': 1,
          'wp.pl': 1,
          'wprost.pl': 1}),
 'pt': Counter({'ahram.org.eg': 1,
          'al-monitor.com': 1,
          'aljazeera.com': 2,
          'bangkokpost.com': 1,
          'bbc.co.uk': 2,
          'chron.com': 1,
          'cnn.com': 3,
          'dawn.com': 1,
          'dw.de': 1,
          'estadao.com.br': 1,
          'guardian.co.uk': 2,
          'hurriyetdailynews.com': 1,
          'independent.co.uk': 1,
          'newsmax.com': 1,
          'ruvr.ru': 1,
          'telegraph.co.uk': 1,
          'thestar.com': 1,
          'timesofoman.com': 1,
          'usatoday.com': 1}),
 'ro': Counter({'ahram.org.eg': 1,
          'alarabiya.net': 1,
          'aljazeera.com': 1,
          'archive.is': 1,
          'bbc.co.uk': 5,
          'cnn.com': 1,
          'egyptindependent.com': 3,
          'guardian.co.uk': 1,
          'nytimes.com': 1,
          'reuters.com': 1,
          'romanialibera.ro': 1,
          'telegraph.co.uk': 1,
          'thestar.com': 1,
          'washingtontimes.com': 1,
          'whitehouse.gov': 1,
          'youtube.com': 1}),
 'ru': Counter({'9tv.co.il': 7,
          'aamulehti.fi': 1,
          'ca-news.org': 2,
          'cliodynamics.ru': 3,
          'fontanka.ru': 5,
          'google.com': 1,
          'hs.fi': 1,
          'kommersant.ru': 1,
          'lenta.ru': 3,
          'mignews.com': 1,
          'newsru.co.il': 5,
          'newsru.com': 1,
          'pravmir.ru': 1,
          'rambler.ru': 1,
          'reuters.com': 1,
          'rg.ru': 1,
          'ria.ru': 1,
          'rt.com': 1,
          'ruvr.ru': 1,
          'svoboda.org': 1,
          'tribuna.ru': 1,
          'webcitation.org': 4,
          'yle.fi': 1,
          'youtube.com': 1}),
 'sr': Counter({'ahram.org.eg': 2,
          'al-monitor.com': 1,
          'alarabiya.net': 1,
          'aljazeera.com': 1,
          'bbc.co.uk': 4,
          'cliodynamics.ru': 1,
          'cnn.com': 2,
          'dw.de': 1,
          'edmontonjournal.com': 1,
          'guardian.co.uk': 1,
          'hotair.com': 1,
          'hurriyetdailynews.com': 2,
          'independent.co.uk': 1,
          'newsmax.com': 1,
          'npr.org': 1,
          'reuters.com': 1,
          'theaustralian.com.au': 1,
          'thestar.com': 1,
          'time.com': 1,
          'usatoday.com': 1}),
 'tg': Counter({'google.com': 1, 'ozodi.org': 1, 'wikipedia.org': 1}),
 'tr': Counter({'ahram.org.eg': 5,
          'al-monitor.com': 1,
          'alarabiya.net': 1,
          'aljazeera.com': 2,
          'bangkokpost.com': 1,
          'bbc.co.uk': 3,
          'chron.com': 1,
          'cnn.com': 2,
          'dailynewsegypt.com': 1,
          'dawn.com': 1,
          'dw.de': 1,
          'edmontonjournal.com': 2,
          'guardian.co.uk': 3,
          'hrw.org': 1,
          'hurriyet.com.tr': 2,
          'hurriyetdailynews.com': 1,
          'independent.co.uk': 1,
          'newsmax.com': 1,
          'ntvmsnbc.com': 1,
          'telegraph.co.uk': 1,
          'thestar.com': 1,
          'time.com': 2,
          'timesofoman.com': 1,
          'trt.net.tr': 1,
          'usatoday.com': 1,
          'wikimedia.org': 1,
          'wsj.com': 1}),
 'uk': Counter({'9tv.co.il': 5,
          'ahram.org.eg': 1,
          'ca-news.org': 2,
          'cliodynamics.ru': 1,
          'dailynewsegypt.com': 1,
          'egyptindependent.com': 1,
          'fontanka.ru': 4,
          'hurriyetdailynews.com': 1,
          'kommersant.ru': 1,
          'korrespondent.net': 1,
          'lenta.ru': 3,
          'mignews.com': 1,
          'newsru.co.il': 4,
          'pravmir.ru': 1,
          'rambler.ru': 1,
          'ria.ru': 1,
          'rt.com': 1,
          'svoboda.org': 1,
          'timesofisrael.com': 1,
          'webcitation.org': 4,
          'youtube.com': 1}),
 'ur': Counter({'dawn.com': 1, 'guardian.co.uk': 1}),
 'vi': Counter({'ahram.org.eg': 1,
          'alarabiya.net': 1,
          'aljazeera.com': 3,
          'bbc.co.uk': 6,
          'buenosairesherald.com': 1,
          'canada.com': 1,
          'cbc.ca': 2,
          'cbsnews.com': 1,
          'cnn.com': 3,
          'dailymail.co.uk': 1,
          'dailynewsegypt.com': 1,
          'dw.de': 1,
          'egyptindependent.com': 2,
          'expressandstar.com': 1,
          'guardian.co.uk': 1,
          'handelsblatt.com': 1,
          'heute.de': 1,
          'hotair.com': 1,
          'n-tv.de': 1,
          'n24.de': 1,
          'ohchr.org': 1,
          'reuters.com': 2,
          'smh.com.au': 2,
          'spiegel.de': 4,
          'sueddeutsche.de': 1,
          'tagesschau.de': 1,
          'telegraph.co.uk': 2,
          'thestar.com': 1,
          'wdbj7.com': 1,
          'whitehouse.gov': 1,
          'wikimedia.org': 1,
          'ynetnews.com': 1,
          'zeit.de': 1}),
 'zh': Counter({'163.com': 12,
          'aljazeera.com': 1,
          'apdnews.com': 1,
          'appledaily.com.tw': 1,
          'bbc.co.uk': 3,
          'china.com.cn': 2,
          'chinanews.com': 2,
          'cw.com.tw': 1,
          'ecmeg.com': 1,
          'foxnews.com': 1,
          'guardian.co.uk': 1,
          'hexun.com': 1,
          'ifeng.com': 1,
          'israelnationalnews.com': 1,
          'jfdaily.com': 1,
          'jschina.com.cn': 1,
          'k618.cn': 1,
          'middleeastmonitor.com': 2,
          'qantara.de': 1,
          'reuters.com': 1,
          'taiwan.cn': 1,
          'tariqramadan.com': 1,
          'voachinese.com': 1,
          'wsj.com': 1,
          'xinhuanet.com': 1,
          'youth.cn': 1,
          'zaobao.com': 16})}
# Two-mode directed graph: language nodes point at the domains they link to,
# with the link count stored as the edge weight. Each node carries a
# 'nodetype' attribute ('lang' or 'domain') so the two modes can be told apart.
extlink_g = nx.DiGraph()
for language, domain_counts in interlanguage_extlinks.items():
    extlink_g.add_node(language, nodetype='lang')
    # Iterating the Counter yields its domain keys in insertion order.
    extlink_g.add_nodes_from(domain_counts, nodetype='domain')
    extlink_g.add_weighted_edges_from(
        (language, domain, n_links) for domain, n_links in domain_counts.items()
    )

n_nodes = extlink_g.number_of_nodes()
n_edges = extlink_g.number_of_edges()
print("There are {0:,} nodes and {1:,} edges in the network".format(n_nodes, n_edges))

nx.write_gexf(extlink_g,'2013_Egyptian_coup_extlinks.gexf')
There are 408 nodes and 686 edges in the network
 

Old and busted code

Scrape image category memberships

Get the category memberships

We should also clean up the image files to include the roughly 120 missing images that Emily downloaded by hand, either because their prefixes were missing or broken or because they do not appear on Wikimedia Commons.

# Load the saved image-usage data (nested as article -> language -> filename).
with open('all_coups_file_image_usage.json','r') as usage_file:
    all_coups_image_usage = json.load(usage_file)

# Flatten the nesting into (language, filename) pairs; the article key and the
# innermost values are not needed here.
full_image_list = []
for usage_by_lang in all_coups_image_usage.values():
    for lang_code, usage_by_file in usage_by_lang.items():
        full_image_list.extend((lang_code, filename) for filename in usage_by_file)

print("There are {0} images in the full_image_list".format(len(full_image_list)))

# Peek at the first few entries.
full_image_list[:5]
There are 1494 images in the full_image_list
[('ar', 'ملف:Mhdawi.jpg'),
 ('ar', 'ملف:Trial iraq.jpg'),
 ('ar', 'ملف:Tabaqchali1.jpg'),
 ('pl', 'Plik:Abd al-Wahab Shawaf.png'),
 ('en', 'File:Maxwell D Taylor official portrait.jpg')]
# Regroup the flat {(lang, file): value} mapping into {lang: {file: value}};
# tuple keys cannot be written as JSON object keys, nested string keys can.
reformatted_image_category_memberships = {}
for (lang_code, file_name), memberships in commons_files_categories.items():
    per_lang = reformatted_image_category_memberships.setdefault(lang_code, {})
    per_lang[file_name] = memberships

with open('image_category_memberships.json','w') as out_file:
    json.dump(reformatted_image_category_memberships, out_file)
# Build {lang: {image: category memberships}} for every (lang, image) pair in
# full_image_list, then save the result as JSON.
#
# Each image is looked up through get_commons_category_memberships first; if
# that raises KeyError, the lookup falls back to get_category_memberships for
# the image's own language edition.
# NOTE(review): KeyError presumably means the file is not on Wikimedia
# Commons -- confirm against the helper.
image_category_memberships = {}

for (lang,image) in full_image_list:
    lang_memberships = image_category_memberships.setdefault(lang, dict())

    try:
        # Strip only the (localized) namespace prefix: split on the first
        # colon so titles that themselves contain ':' are kept whole.
        commons_title = 'File:' + image.split(':', 1)[1]

        try:
            lang_memberships[image] = get_commons_category_memberships(commons_title)
        except KeyError:
            # The fallback sits inside the outer try so that a failure here is
            # logged and skipped like any other, instead of aborting the loop.
            lang_memberships[image] = get_category_memberships(image,lang)

    except Exception:
        # Best-effort scrape: report the image and move on. KeyboardInterrupt
        # and SystemExit are not Exception subclasses, so they still propagate.
        print("Error on {0}".format(image))

# Earlier reshaping of commons_files_categories, kept disabled for reference.
#reformatted_image_category_memberships = {}
#for (lang,file),l in commons_files_categories.items():
#    if lang not in reformatted_image_category_memberships:
#        reformatted_image_category_memberships[lang] = {}
#        reformatted_image_category_memberships[lang][file] = l
#    else:
#        reformatted_image_category_memberships[lang][file] = l    

with open('image_category_memberships.json','w') as f:
    json.dump(image_category_memberships,f)
Error on File:2014 0526 Thailand coup Chang Phueak Gate Chiang Mai 02.jpg
Error on File:Thai-coup-detat-2014-social-media-banner.jpg
Error on File:Prayuth Jan-ocha 2010-06-17 Cropped.jpg
# Write the in-memory category memberships to disk again.
with open('image_category_memberships.json', mode='w') as out_file:
    json.dump(obj=image_category_memberships, fp=out_file)

Scrape other images in categories

# Reload the saved memberships (nested as language -> file -> list of dicts).
with open('image_category_memberships.json','r') as memberships_file:
    image_category_memberships = json.load(memberships_file)

# Collect every distinct key of the innermost dicts; those keys are the
# category names, and the values paired with them are ignored.
unique_category_names = set()
for files_for_lang in image_category_memberships.values():
    for category_entries in files_for_lang.values():
        for entry in category_entries:
            unique_category_names.update(entry.keys())

image_category_list = list(unique_category_names)

print("There are {0:,} unique categories".format(len(image_category_list)))
There are 1,135 unique categories
# Fetch the file members of each category, skipping categories whose name
# contains one of the excluded phrases (case-insensitive) or is exactly a
# bare year category from 1950 through 2017.
excluded_phrases = [phrase.lower() for phrase in ['births','deaths','Media needing categories']]
year_cats =  ['Category:'+str(i) for i in range(1950,2018)]
commons_category_file_members_dict = {}

for cat in image_category_list:
    lowered_cat = cat.lower()
    if cat in year_cats or any(phrase in lowered_cat for phrase in excluded_phrases):
        continue
    commons_category_file_members_dict[cat] = get_commons_category_file_members(cat)

with open('commons_category_file_members.json','w') as out_file:
    json.dump(commons_category_file_members_dict, out_file)
# Pool the members of every fetched category and drop duplicates.
distinct_files = set()
for members in commons_category_file_members_dict.values():
    distinct_files.update(members)

commons_category_files = list(distinct_files)

print("There are {0:,} files in categories used by coup images".format(len(commons_category_files)))
There are 149,038 files in categories used by coup images