# coding: utf-8
# In[8]:
import pywikibot
from collections import namedtuple
import hashlib
import re
import time
from base64 import b64encode
import requests
import json
# In[50]:
# Run configuration for this script.
# User name handed to the page-header format call further down.
pybot_username = "Venkateswaran raman"
# Name of the document whose pages are processed.
pybot_docname = "ललितविस्तरः.pdf"
# Page selection in "start:end[:step]" form; the end page is inclusive.
pybot_pages = "1:453"
# In[51]:
class OCRException(Exception):
    """Signal that an OCR request failed.

    Used as the exception type that marks a failed OCR call as retryable.
    """
class retry(object):
    """Decorator that re-runs a callable when it raises a given exception.

    Usage: ``@retry(retries=5, sleep=25, exception=SomeError)``.

    Args:
        retries: Number of additional attempts after the first call, so the
            callable runs at most ``retries + 1`` times.
        sleep: Seconds to wait between attempts.
        exception: Exception class (or tuple of classes) that triggers a
            retry. Subclasses are retried as well. Any other exception
            propagates immediately without a retry.

    Raises:
        The last caught ``exception`` once all attempts are used up.
    """

    def __init__(self, retries, sleep, exception):
        self.retries = retries
        self.sleep = sleep
        self.exception = exception

    def __call__(self, f):
        # Imported here so this block needs no new file-level import.
        import functools

        @functools.wraps(f)  # keep the wrapped function's name and docstring
        def inner(*args, **kwargs):
            for attempt in range(self.retries + 1):
                try:
                    return f(*args, **kwargs)
                except self.exception:
                    if attempt == self.retries:
                        print("GIVING UP!!... Please retry after sometime ")
                        # Bare raise keeps the original traceback.
                        raise
                    print("OCR Extract failed.\n======== \nRETRY {}: \n======== \nseleeping for {} seconds before retry".format(attempt + 1, self.sleep))
                    time.sleep(self.sleep)
        return inner
# ### Function Definitions
# In[52]:
def to_sans(numb):
    """
    Render an integer with Devanagari digits and return it as a string.

    Every character of the formatted value is shifted by a fixed code point
    offset, so the input is expected to format to ASCII digits only.
    """
    # 2358 is the distance from ASCII '0' (48) to Devanagari zero (2406).
    shift = 2358
    ascii_text = "{}".format(numb)
    return ''.join(chr(ord(ch) + shift) for ch in ascii_text)
def to_eng(sans_num):
    """
    Convert a string of Devanagari digits to the int it denotes.

    Each character is shifted down by the same fixed offset that to_sans
    adds, and the resulting ASCII text is parsed with int().
    """
    shift = 2358
    ascii_digits = map(lambda ch: chr(ord(ch) - shift), sans_num)
    return int(''.join(ascii_digits))
def get_range(p):
    """
    Parse a page range of the form "start:end:step" into range() arguments.

    Args:
        p: Range string. Missing parts use defaults: step is 1, start is 0
            and end equals start. Whitespace around each part is ignored.
            Parts beyond the third are ignored.

    Returns:
        A tuple (start, stop, step) of ints for range(). The given end is
        inclusive: stop is end + 1 for a non-negative step and end - 1 for
        a negative step.

    Raises:
        ValueError: if a part that is present is not a valid integer.
    """
    fields = [field.strip() for field in p.split(":")]
    # Pad with empty parts so that omitted trailing fields unpack as missing.
    fields.extend(("", "", ""))
    (start, end, step) = fields[:3]
    # Default the missing values.
    if not step:
        step = 1
    if not start:
        start = 0
    if not end:
        end = start
    print("start={}; end={}; step={};".format(start, end, step))
    start, end, step = int(start), int(end), int(step)
    # Move the exclusive stop one past the inclusive end, in the direction
    # of travel, so that descending ranges also include their end.
    stop = end + 1 if step >= 0 else end - 1
    return (start, stop, step)
# In[53]:
# Record for one page to process: wiki page title (url), displayed page
# number (pageno), page index within the document (actualpage) and the
# image URL handed to the OCR service (ocr_url).
PageInfo = namedtuple('PageInfo', 'url pageno actualpage ocr_url')
def generate_pages(docname, rng, displayno):
    """
    Yield one PageInfo per page selected by rng.

    Args:
        docname: Document name used in the wiki page title and image URL.
        rng: Range string understood by get_range.
        displayno: Display number for the first page; it advances by the
            range step for every page yielded.
    """
    first, stop, stride = get_range(rng)
    shown = displayno
    for actual in range(first, stop, stride):
        title = "पृष्ठम्:{}/{}".format(docname, to_sans(actual))
        image_url = get_url(docname, actual, "jpg")
        yield PageInfo(
            url=title,
            pageno=shown,
            actualpage=actual,
            ocr_url=image_url,
        )
        shown += stride
## helper functions for generating text from google ocr
def get_url_md5(docname):
    """
    Return the "/x/xy/" path segment derived from the MD5 of docname.

    x is the first hex digit and xy the first two hex digits of the MD5
    digest of the UTF-8 encoded name.
    """
    digest = hashlib.md5(docname.encode("utf-8")).hexdigest()
    return "/{}/{}/".format(digest[0], digest[:2])
def get_url(docname, pageno, image_format):
    """
    Build the thumbnail image URL for one page of a document.

    Args:
        docname: Document file name.
        pageno: Page number inserted into the "page{n}-1024px-" part.
        image_format: File extension of the rendered image, e.g. "jpg".

    Returns:
        The URL as a string.
    """
    thumb_root = "https://upload.wikimedia.org/wikipedia/commons/thumb/"
    hash_dirs = get_url_md5(docname)
    page_part = "page{}-1024px-".format(pageno)
    # NOTE(review): the result contains "thumb//" (the root ends with "/" and
    # the hash segment starts with "/") and "px-." before the document name.
    # Kept exactly as before; confirm against the thumbnail URL scheme.
    directory = "{}{}{}".format(thumb_root, hash_dirs, docname)
    filename = "{}.{}.{}".format(page_part, docname, image_format)
    return "{}/{}".format(directory, filename)
@retry(retries=5,sleep=25,exception=OCRException)
def url_ocr(image_uri):
    """
    Request OCR text for an image URL from the ws-google-ocr web API.

    The request asks for language "sa" and times out after 30 seconds.
    Every failure is reported as OCRException so that the retry decorator
    can re-run the call.

    Args:
        image_uri: URL of the image to run OCR on.

    Returns:
        The value of the "text" field of the JSON response.

    Raises:
        OCRException: if the HTTP request fails, the response is not valid
            JSON, or the response has no "text" field.
    """
    endpoint = "https://tools.wmflabs.org/ws-google-ocr/api.php"
    print("{}?image={}&lang=sa".format(endpoint, image_uri))
    try:
        # Pass the query via params so the image URL is percent-encoded and
        # characters such as "&" or "#" in a file name cannot break the query.
        resp = requests.post(
            endpoint,
            params={"image": image_uri, "lang": "sa"},
            timeout=30,
        )
        resp.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print ("Http Error:",errh)
        raise OCRException(errh) from errh
    except requests.exceptions.ConnectionError as errc:
        print ("Error Connecting:",errc)
        raise OCRException(errc) from errc
    except requests.exceptions.Timeout as errt:
        print ("Timeout Error:",errt)
        raise OCRException(errt) from errt
    except requests.exceptions.RequestException as err:
        print ("OOps: Something Else",err)
        raise OCRException(err) from err
    try:
        ocrjson = json.loads(resp.text)
    except ValueError as errj:
        # A non-JSON body is treated like any other failed OCR attempt.
        print ("Invalid JSON in OCR response:",errj)
        raise OCRException(errj) from errj
    ocrtext = ocrjson.get("text") if isinstance(ocrjson, dict) else None
    if ocrtext is None:
        # Returning None here would let callers format it into the literal
        # string "None"; report a failed attempt instead.
        raise OCRException("OCR response has no 'text' field: {!r}".format(ocrjson))
    return ocrtext
# In[54]:
# Lazily enumerate the configured pages; display numbering starts at 1.
pages = generate_pages(pybot_docname, pybot_pages, 1)
# In[ ]:
#for p in pages:
# print(p)
# In[ ]:
site = pywikibot.Site("sa", "wikisource")
# NOTE(review): the original indentation of this loop was lost; the separator
# line is assumed to be printed once per page - confirm.
for p in pages:
    print(p)
    page = pywikibot.Page(site, p.url)
    # NOTE(review): both wrappers are empty strings, so the format call has
    # no effect; confirm whether markup is missing from these literals.
    header = "".format(pybot_username)
    footer = ""
    if page.text != '':
        print("page text already exists. Skipping ocr")
    else:
        # Only pages without text are sent to OCR and saved.
        ocrtext = url_ocr(p.ocr_url)
        page.text = "{}{}{}".format(header, ocrtext, footer)
        page.status = u"OCR done"
        page.save(summary=u'OCR done.')
    print("=======================================================================")