# -*- coding: utf-8  -*-
import pywikibot
import re
import pywikibot.data.sparql
import time
from pywikibot.comms import http
import json
from pywikibot import pagegenerators
import urllib.request
from urllib.parse import unquote

def regex(text):
    """Strip wiki/HTML markup from a caption string.

    Cleanup steps, applied in order:
    1. remove ``<br>`` tags, including self-closing and HTML-escaped
       forms (``<br/>``, ``&lt;br&gt;``, ...);
    2. drop everything from the first remaining ``<`` to the end of the
       string (truncates trailing markup such as ``<small>...``);
    3. collapse wikilinks ``[[target|label]]`` / ``[[target]]`` down to
       their display text.

    Returns the cleaned string.
    """
    newText = re.sub(r'(<|&lt;)br\s*\/*(>|&gt;)', '', text)  # remove <br> variants
    # BUG FIX: this substitution previously operated on `text`, silently
    # discarding the <br> removal above; it must chain through newText.
    newText = re.sub(r'\<.*', '', newText)  # truncate at remaining markup, e.g. <small>...
    newText = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]', r'\1', newText)  # [[a|b]] -> b, [[a]] -> a
    return newText
    
def getCaptions(mediaid, jsonF=False):
    """Fetch the existing structured-data captions (labels) of a Commons file.

    mediaid -- media entity id such as 'M4026453'.
    jsonF   -- if True, return the list of language codes that already
               have a caption; if False, print each 'lang: value' pair
               and return None.

    On any fetch/parse error an empty list is returned, so callers can
    always use the result in a membership test (`lang not in result`).
    Example endpoint:
    https://commons.wikimedia.org/wiki/Special:EntityData/M4026453.json
    """
    url = u'https://commons.wikimedia.org/wiki/Special:EntityData/{}.json'.format(mediaid)
    try:
        # Context manager closes the HTTP response even on parse errors.
        with urllib.request.urlopen(url) as jsonurl:
            data = json.load(jsonurl)
        labels = data['entities'][mediaid]['labels']
        if jsonF:
            return list(labels)
        for lang in labels:
            print("{}: {}".format(lang, labels[lang]['value']))
    except (OSError, KeyError, ValueError):
        # OSError covers urllib.error.URLError; KeyError a missing entity;
        # ValueError malformed JSON.  Was a bare `except:` returning "",
        # which also swallowed KeyboardInterrupt/SystemExit.
        return []
        
        
def addCaptions(mediaid, value, pageid, ID):
    """Add missing language captions (labels) to a Commons media entity.

    mediaid -- media entity id such as 'M12345'.
    value   -- dict keyed by pageid with parallel lists under 'lang' and
               'text' (built by main() from P2096 qualifiers).
    pageid  -- key into *value* for this file.
    ID      -- Wikidata item id (e.g. 'Q42'), used in the edit summary.

    Languages that already have a caption on Commons are skipped.
    Returns "" when there is nothing to add; otherwise performs a
    wbeditentity POST (side effect only, no return value).
    """
    labels = {}
    existingCap = getCaptions(mediaid, jsonF=True)

    # Keep only languages that do not already have a caption.
    for (lang), (text) in zip(value[pageid]['lang'], value[pageid]['text']):
        if lang not in existingCap:
            labels[lang] = {u'language': lang, 'value': text}

    if not labels:
        pywikibot.output('Didnt add anything to {0}. Skipping.'.format(mediaid))
        return ""

    # Fetch a CSRF token for the write request.
    tokenrequest = http.fetch(u'https://commons.wikimedia.org/w/api.php?action=query&meta=tokens&type=csrf&format=json')
    tokendata = json.loads(tokenrequest.text)
    token = tokendata.get(u'query').get(u'tokens').get(u'csrftoken')

    # BUG FIX: report the number of captions actually added (len(labels)),
    # not the total qualifier count, which overstated skipped languages.
    summary = u'adding captions for {0} languages based on [[d:{1}|{1}]] qualifier(s), media legend (P2096)'.format(len(labels), ID)
    pywikibot.output(mediaid + u' ' + summary)

    postdata = {u'action' : u'wbeditentity',
                u'format' : u'json',
                u'id' : mediaid,
                u'data' : json.dumps({u'labels' : labels}),
                u'token' : token,
                u'summary' : summary,
                u'bot' : True,
                }
    apipage = http.fetch(u'https://commons.wikimedia.org/w/api.php', method='POST', data=postdata)

def runOnCommons(captions, ID):
    """Push captions for every page id in *captions* to Commons.

    captions -- dict keyed by Commons pageid (string) with 'lang'/'text'
                lists; forwarded unchanged to addCaptions().
    ID       -- Wikidata item id used in the edit summaries.

    A pageid N maps to media entity id 'MN'.
    """
    # Removed unused `repo`/`site` locals: both were constructed and never
    # read, and pywikibot.Site() instantiation is not free.
    for pageid in captions:
        mediaid = u'M%s' % (pageid,)
        addCaptions(mediaid, captions, pageid, ID)

def main(*args):
    """
    Main function. Query Wikidata for humans (P31=Q5) whose image (P18)
    statement carries a media legend (P2096) qualifier, and copy those
    legends to the corresponding Commons files as captions.

    Throttles: sleeps 60 s after every 50 items that triggered edits.
    """
    repo = pywikibot.Site("wikidata", "wikidata").data_repository()
    mediaLeg = u"""SELECT DISTINCT ?item
    WHERE
    {
     ?item wdt:P31 wd:Q5.
     ?item p:P18 ?statement.
     ?statement ps:P18 ?image.
     ?statement pq:P2096 ?media.
    }"""
    personGen = pagegenerators.PreloadingEntityGenerator(pagegenerators.WikidataSPARQLPageGenerator(mediaLeg, site=repo))
    count = 0
    for item in personGen:
        if item.isRedirectPage():
            pywikibot.output('{0} is a redirect page. Skipping.'.format(item))
            continue
        item.get()  # Populate the item dictionary
        value = {}
        print('--> ' + item.getID() + ': ')
        # Walk every image statement (P18); .get() avoids a KeyError when
        # the SPARQL result is stale and P18 has since been removed.
        for claim in item.claims.get('P18', []):
            target = claim.getTarget()
            if target is None:
                # somevalue/novalue statement: no file page to key on.
                continue
            key = str(target.pageid)
            value[key] = {'lang': [], 'text': []}
            for qual in claim.qualifiers.get('P2096', []):
                # Collect (language, cleaned text) pairs from each media
                # legend qualifier; skip malformed qualifier values.
                try:
                    legend = qual.getTarget()
                    value[key]['lang'].append(legend.language)
                    value[key]['text'].append(regex(legend.text))
                except Exception:  # was a bare except; keep best-effort skip
                    continue

        if value:
            runOnCommons(value, item.getID())
            count += 1
            for (lang), (text) in zip(value[key]['lang'], value[key]['text']) if False else ():
                pass  # (placeholder removed below)
            for pageid in value:
                for (lang), (text) in zip(value[pageid]['lang'], value[pageid]['text']):
                    print('{}; {}: {}'.format(pageid, lang, text))
            # BUG FIX: `if count == 50` outside this branch slept repeatedly
            # while count sat at 50 and never again afterwards.  Throttle
            # exactly once per 50 edited items, right after incrementing.
            if count % 50 == 0:
                pywikibot.output('Sleep for 60 sec. ...')
                time.sleep(60)
        
if __name__ == "__main__":
    main()
--> Q3323160: 
M48995522 adding captions for 1 languages based on [[d:Q3323160|Q3323160]] qualifier(s), media legend (P2096)
48995522; es: Martos pintado por Ignacio Suárez Llanos, 1874.
--> Q3163743: 
M51226771 adding captions for 1 languages based on [[d:Q3163743|Q3163743]] qualifier(s), media legend (P2096)
51226771; fr: Jean-Auguste Brutails vers 1896.
--> Q24642302: 
M58611309 adding captions for 1 languages based on [[d:Q24642302|Q24642302]] qualifier(s), media legend (P2096)
58611309; ru: Евгений Орлов в жюри фестиваля Generation NEXT Dance
--> Q3025572: 
M4314970 adding captions for 1 languages based on [[d:Q3025572|Q3025572]] qualifier(s), media legend (P2096)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/srv/paws/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    378                 # Python 2.7, use buffering of HTTP responses
--> 379                 httplib_response = conn.getresponse(buffering=True)
    380             except TypeError:

TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-2-3b0ce09562f9> in <module>
    114 
    115 if __name__ == "__main__":
--> 116     main()

<ipython-input-2-3b0ce09562f9> in main(*args)
    103 
    104         if value:
--> 105             runOnCommons(value, item.getID())
    106             count += 1
    107             for pageid in value:

<ipython-input-2-3b0ce09562f9> in runOnCommons(captions, ID)
     65     for pageid in captions:
     66         mediaid = u'M%s' % (pageid,)
---> 67         addCaptions(mediaid, captions, pageid, ID)
     68 
     69 def main(*args):

<ipython-input-2-3b0ce09562f9> in addCaptions(mediaid, value, pageid, ID)
     57                 u'bot' : True,
     58                 }
---> 59     apipage = http.fetch(u'https://commons.wikimedia.org/w/api.php', method='POST', data=postdata)
     60 
     61 def runOnCommons(captions, ID):

/srv/paws/pwb/pywikibot/comms/http.py in fetch(uri, method, params, body, headers, default_error_handling, use_fake_user_agent, data, **kwargs)
    519             headers['user-agent'] = fake_user_agent()
    520 
--> 521     request = _enqueue(uri, method, params, body, headers, **kwargs)
    522     # if there's no data in the answer we're in trouble
    523     assert request._data is not None

/srv/paws/pwb/pywikibot/comms/http.py in _enqueue(uri, method, params, body, headers, data, **kwargs)
    475     request = threadedhttp.HttpRequest(
    476         uri, method, params, body, all_headers, callbacks, **kwargs)
--> 477     _http_process(session, request)
    478     return request
    479 

/srv/paws/pwb/pywikibot/comms/http.py in _http_process(session, http_request)
    388                                    headers=headers, auth=auth, timeout=timeout,
    389                                    verify=not ignore_validation,
--> 390                                    **http_request.kwargs)
    391     except Exception as e:
    392         http_request.data = e

/srv/paws/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    531         }
    532         send_kwargs.update(settings)
--> 533         resp = self.send(prep, **send_kwargs)
    534 
    535         return resp

/srv/paws/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
    644 
    645         # Send the request
--> 646         r = adapter.send(request, **kwargs)
    647 
    648         # Total elapsed time of the request (approximately)

/srv/paws/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    447                     decode_content=False,
    448                     retries=self.max_retries,
--> 449                     timeout=timeout
    450                 )
    451 

/srv/paws/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    601                                                   timeout=timeout_obj,
    602                                                   body=body, headers=headers,
--> 603                                                   chunked=chunked)
    604 
    605             # If we're going to release the connection in ``finally:``, then

/srv/paws/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    381                 # Python 3
    382                 try:
--> 383                     httplib_response = conn.getresponse()
    384                 except Exception as e:
    385                     # Remove the TypeError from the exception chain in Python 3;

/usr/lib/python3.6/http/client.py in getresponse(self)
   1329         try:
   1330             try:
-> 1331                 response.begin()
   1332             except ConnectionError:
   1333                 self.close()

/usr/lib/python3.6/http/client.py in begin(self)
    295         # read until we get a non-100 response
    296         while True:
--> 297             version, status, reason = self._read_status()
    298             if status != CONTINUE:
    299                 break

/usr/lib/python3.6/http/client.py in _read_status(self)
    256 
    257     def _read_status(self):
--> 258         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    259         if len(line) > _MAXLINE:
    260             raise LineTooLong("status line")

/usr/lib/python3.6/socket.py in readinto(self, b)
    584         while True:
    585             try:
--> 586                 return self._sock.recv_into(b)
    587             except timeout:
    588                 self._timeout_occurred = True

/usr/lib/python3.6/ssl.py in recv_into(self, buffer, nbytes, flags)
   1010                   "non-zero flags not allowed in calls to recv_into() on %s" %
   1011                   self.__class__)
-> 1012             return self.read(nbytes, buffer)
   1013         else:
   1014             return socket.recv_into(self, buffer, nbytes, flags)

/usr/lib/python3.6/ssl.py in read(self, len, buffer)
    872             raise ValueError("Read on closed or unwrapped SSL socket.")
    873         try:
--> 874             return self._sslobj.read(len, buffer)
    875         except SSLError as x:
    876             if x.args[0] == SSL_ERROR_EOF and self.suppress_ragged_eofs:

/usr/lib/python3.6/ssl.py in read(self, len, buffer)
    629         """
    630         if buffer is not None:
--> 631             v = self._sslobj.read(len, buffer)
    632         else:
    633             v = self._sslobj.read(len)

KeyboardInterrupt: