import sys
import json
import csv
import requests 
import pprint
import re
from tqdm import tqdm_notebook
from urllib.parse import quote
from urllib.request import urlopen
from urllib.request import urlretrieve
from multiprocessing.pool import ThreadPool
from time import time as timer
import time

import pandas as pd

# Azure Custom Vision settings (prediction key and endpoint URL) are read from
# a local JSON file rather than being written into the source.
with open('azurecredentials.json') as f:
    data = json.load(f)
PREDICTION_KEY = data['predictionkey']
AZURE_CV_URL = data['azure_cv_url']

# Headers sent with every prediction POST: JSON body plus our secret key
headers = {
    'Content-Type': 'application/json',
    'Prediction-Key': PREDICTION_KEY,
}

# NOTE(review): this template holds only a bare placeholder and is not used by
# any code visible in this file - confirm whether a URL prefix is missing.
metapibaseurl = '{}'

# Azure predictions are only reported when their probability exceeds this value
threshold = 0.6

# Number of parallel worker threads.
# WARNING - with too many threads Azure starts returning errors.
maxthreads = 15

# Azure signals rate limiting with a response such as:
#   {'statusCode': 429, 'message': 'Rate limit is exceeded. Try again in 1 seconds.'}
# so a request is retried up to `maxtries` times, sleeping `waitinterval`
# seconds between attempts.
maxtries = 20
waitinterval = 1

# Largest image size, in bytes, that Azure CV will accept
max_azure_image_size = 4000000

def imageurl2predictiondf(url):
    """Send an image URL to the Azure Custom Vision endpoint and return its predictions.

    The request is POSTed to the module-level ``AZURE_CV_URL`` with the
    module-level ``headers``. When Azure answers with status code 429 (rate
    limit exceeded) the call sleeps ``waitinterval`` seconds and retries, up
    to ``maxtries`` times.

    Args:
        url: URL of the image to classify; surrounding whitespace is stripped.

    Returns:
        A pandas DataFrame with the columns ``tagName`` and ``probability``
        on success. ``None`` when Azure returns any other error, when the
        request or response parsing raises, or when the retry limit is hit.
    """
    # json.dumps escapes quotes and backslashes, so an unusual URL cannot
    # produce an invalid JSON body the way plain string concatenation could.
    payload = json.dumps({'url': url.strip()})
    tries = 0
    while tries < maxtries:
        try:
            # Send POST request to our Azure Custom Vision API
            r = requests.post(AZURE_CV_URL, headers=headers, data=payload).json()
            # Successful predictions returned
            if 'predictions' in r:
                return pd.DataFrame(r['predictions'], columns=['tagName', 'probability'])
            # Rate limited: wait a bit, record the attempt and try again
            if 'statusCode' in r and r['statusCode'] == 429:
                time.sleep(waitinterval)
                tries += 1
                continue
            # Unexpected error condition
            print("imageurl2predictiondf:", "error returned from Azure for", url, file=sys.stderr)
            return None
        except Exception as e:
            # Best effort: one failing image must not abort the whole batch,
            # but the failure is reported instead of being dropped silently.
            print("imageurl2predictiondf:", "exception", e, "for", url, file=sys.stderr)
            return None
    print('imageurl2predictiondf:', ' retry limit after ', tries, url, file=sys.stderr)
    return None

def commonsfile2url(filename):
    """Return the full URL of a Wikimedia Commons file, looked up via the Commons API.

    Args:
        filename: Commons file name as stored in Wikidata (P18),
            e.g. ``September SAAM-1909.7.13 1.jpg``.

    Returns:
        The ``url`` of the first ``imageinfo`` entry found in the response
        (path query -> pages -> <pageid> -> imageinfo -> 0 -> url), or
        ``None`` when the response contains no such entry.
    """
    # NOTE(review): the template previously had no endpoint in front of the
    # placeholder; it has been reconstructed as a MediaWiki Action API
    # ``action=query`` request on Commons - confirm against the intended URL.
    COMMONS_API_URL = ('https://commons.wikimedia.org/w/api.php?action=query'
                       '&titles=File:{}&prop=imageinfo&iiprop=url&format=json')
    cqueryurl = COMMONS_API_URL.format(quote(filename))

    with urlopen(cqueryurl) as newurl:
        response = json.loads(newurl.read().decode())

    # Pages are keyed by page id; a page without "imageinfo" is skipped so a
    # missing file yields None instead of an unbound local or a KeyError.
    pages = response.get("query", {}).get("pages", {})
    for page in pages.values():
        imageinfo = page.get("imageinfo")
        if imageinfo:
            return imageinfo[0]["url"]
    return None

def qid2commonsfile(qid):
    """Convert a Wikidata QID to its Commons image file name via the Wikidata API.

    Example: ``Q42`` -> ``'Douglas adams portrait cropped.jpg'``.

    NOTE(review): relies on the module-level names ``WD_API_URL`` and
    ``imageproperty``, which are not defined in the visible part of this
    file - confirm they exist.

    Args:
        qid: Wikidata item id; trailing whitespace is stripped.

    Returns:
        The file name held by the first ``imageproperty`` claim
        (claims -> <property> -> 0 -> mainsnak -> datavalue -> value) when
        that snak has datatype ``commonsMedia``; otherwise ``None``.
    """
    wdurl = WD_API_URL.format(qid.rstrip(), imageproperty)

    with urlopen(wdurl) as url:
        wd_r = json.loads(url.read().decode())

    # Items without the property (or an error response) simply have no file
    statements = (wd_r.get("claims") or {}).get(imageproperty) or []
    if not statements:
        return None

    mainsnak = statements[0].get("mainsnak") or {}
    # Only a commonsMedia value is a file name
    if mainsnak.get("datatype") != 'commonsMedia':
        return None

    # A snak may carry no "datavalue" at all
    filename = (mainsnak.get("datavalue") or {}).get("value")
    return filename or None

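# Commons URL string types
# NOTE(review): `fromstring` and `tostring`, which qid2predictionlist uses to
# turn a Commons file URL into its thumbnail URL prefix, are not defined in
# the visible part of this file; presumably they belong here - confirm.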

# Regexps that classify an image by its file name ending, so that non-JPG
# formats can be requested from Commons as JPG.
# Matching is case-insensitive: file names also end in forms such as
# ".JPEG", ".Jpg", ".Png" or ".Tiff", which an explicit list of spellings
# would miss and wrongly report as an unknown file type.
jpattern = re.compile(r'.*(jpg|jpeg)$', re.IGNORECASE)
ppattern = re.compile(r'.*(png)$', re.IGNORECASE)
tpattern = re.compile(r'.*(tif|tiff)$', re.IGNORECASE)
gpattern = re.compile(r'.*(gif)$', re.IGNORECASE)

def qid2predictionlist(qid):
    """Classify the Commons image of a Wikidata item with Azure Custom Vision.

    The image's file name, byte size and full URL are looked up first. Large
    JPGs and all TIFF/PNG/GIF files are replaced by a scaled-down JPG
    thumbnail URL, because Azure rejects images larger than
    ``max_azure_image_size`` bytes.

    Args:
        qid: Wikidata item id, e.g. ``Q42``.

    Returns:
        A tuple ``(qid, tags)``. ``tags`` is the list of ``tagName`` values
        whose probability exceeds ``threshold``, or ``None`` when no tag
        qualifies, the file type is not recognised, the image could not be
        resolved, or no prediction was returned.
    """
    # Get name of the Commons image for the Wikidata item
    commonsfilename = qid2commonsfile(qid)
    if not commonsfilename:
        print('qid2predictionlist: no Commons file name for', qid, file=sys.stderr)
        return qid, None
    quotedname = quote(commonsfilename)

    # Look up the byte size (width and height are not needed here)
    _, _, size = commonsfiledimensions(quotedname)

    # Find out the full URL of the Commons file
    commonsfileurl = commonsfile2url(commonsfilename)
    if not commonsfileurl:
        print('qid2predictionlist: no Commons URL for', qid, file=sys.stderr)
        return qid, None

    # Eventual URL sent to Azure may need to be altered to point to a smaller
    # JPG version. Initialize with the original.
    payloadurl = commonsfileurl
    # Thumbnail URL prefix, in case it is needed for sizing down.
    # NOTE(review): fromstring/tostring are module-level names that are not
    # defined in the visible part of this file - confirm.
    thumbimageurl = commonsfileurl.replace(fromstring, tostring)

    # Set when a freshly scaled rendition is requested that Commons may need
    # time to generate
    imagefile_resize_wait = False

    if jpattern.match(commonsfilename):
        # JPG: only swap in the scaled version when the file is known to be
        # too big. An unknown size (lookup failed) keeps the original URL.
        if size is not None and size > max_azure_image_size:
            payloadurl = thumbimageurl + '/1536px-' + quotedname
    elif tpattern.match(commonsfilename):
        # TIFF file, scale down and add "jpg" extension to return a smaller jpg
        # Most TIFF artworks are huge (50 Mbytes or so)
        imagefile_resize_wait = True
        payloadurl = thumbimageurl + '/1536px-' + quotedname + '.jpg'
    elif ppattern.match(commonsfilename):
        # PNG file, compromise, 1024px resizing to jpg
        # Most PNG files are fairly small, but some are massive
        imagefile_resize_wait = True
        payloadurl = thumbimageurl + '/1024px-' + quotedname + '.jpg'
    elif gpattern.match(commonsfilename):
        # GIF file, scale down and add "jpg" extension to return a smaller jpg
        # Most GIF artworks are likely very low resolution
        payloadurl = thumbimageurl + '/1024px-' + quotedname + '.jpg'
    else:
        # An image file that we don't understand
        print(qid, "filetype error: " + commonsfilename)
        return qid, None

    # If the image being requested is being resized (likely down from a big
    # TIFF), pull the resized image to /dev/null to prime the cache on
    # Commons. That way Azure does not have to wait for a large file to be
    # downsampled, which can take more than 10 seconds for a 60 Mbyte TIFF
    # and may be what makes Azure time out.
    if imagefile_resize_wait:
        tries = 0
        while tries < maxtries:
            try:
                # Transfer the large resized file to a black hole
                urlretrieve(payloadurl, "/dev/null")
                break
            except Exception as e:
                tries += 1
                print(qid, 'priming Commons cache exception: try ', tries, e, 'for', payloadurl)

    # payloadurl is now set for the request; get a prediction from Azure
    df = imageurl2predictiondf(payloadurl)
    if df is None:
        print('qid2predictionlist: no dataframe returned for', qid, file=sys.stderr)
        return qid, None

    tags = df[df['probability'] > threshold]['tagName'].tolist()
    # An empty selection is reported as None rather than []
    return qid, (tags or None)

def commonsfiledimensions(filename):
    """Return width, height and byte size of a Commons file from the Commons API.

    Example: ``foo.jpg`` -> ``(3354, 5165, 51992602)``.

    NOTE(review): ``COMMONS_API_URL`` is read here as a module-level name,
    but the only definition visible in this file is a local variable inside
    commonsfile2url - confirm a module-level template exists.

    Args:
        filename: File name inserted verbatim into ``COMMONS_API_URL``; the
            caller visible in this file passes it already URL-quoted.

    Returns:
        A tuple ``(width, height, size)`` taken from the first page's first
        ``imageinfo`` entry, or ``(None, None, None)`` when the response has
        no pages or anything goes wrong.
    """
    try:
        wdurl = COMMONS_API_URL.format(filename)
        with urlopen(wdurl) as url:
            commons_r = json.loads(url.read().decode())
        if commons_r['query'] and commons_r['query']['pages']:
            # Pages are keyed by page id; only the first page is used
            info = next(iter(commons_r['query']['pages'].values()))['imageinfo'][0]
            return info['width'], info['height'], info['size']
        return None, None, None
    except Exception as e:
        # Best effort: report the problem and let the caller carry on
        print('Excpetion', e)
        return None, None, None
# qid2commonsfile('Q42')
# qid2predictionlist('Q42')

# Override the confidence threshold for this run
threshold = 0.55
# Sample result tuple kept from an earlier run: ('Q3464082', ['Women'])
outputfile = 'predictions.csv'

# Set SPARQL query, make sure main Wikidata items are returned in "?q".
# NOTE(review): the SELECT wrapper and the endpoint URL below were missing or
# empty; they have been reconstructed from how the response is consumed
# (results -> bindings -> q -> value) - confirm against the intended query.
query = '''
SELECT ?q WHERE {
  VALUES ?collections { wd:Q671384 }
  ?q wdt:P195 ?collections .
  ?q wdt:P18 ?image .
}
'''

# Execute Wikidata query
wdqurl = 'https://query.wikidata.org/sparql'
data = requests.get(wdqurl, params={'query': query, 'format': 'json'}).json()

# Make array of qids: keep the trailing Q-number of each returned value
qids = []
for item in data['results']['bindings']:
    match = re.search(r"(Q\d+)$", item['q']['value'])
    if match:   # skip anything that does not end in a Q-number
        qids.append(match.group(0))

# Parallel processing using threads
start = timer()
maxitems = len(qids)

# buffering=1 (line buffering) writes each line to the file immediately.
# The context managers close the file and shut the pool down even when a
# worker raises part-way through.
with ThreadPool(maxthreads) as pool, open(outputfile, 'w', buffering=1) as outfile:
    results = pool.imap_unordered(qid2predictionlist, qids)
    for qid, prediction in tqdm_notebook(results, total=maxitems):
        outfile.write("{},{}\n".format(qid, prediction))

print("Elapsed Time (parallel threads): %s" % (timer() - start,))
# Sample output from an earlier run:
# Elapsed Time (parallel threads): 659.2421734333038