# coding: utf-8
"""Fill empty sa.wikisource page-namespace pages with OCR text.

For every page of a PDF in the configured range, this script builds the
thumbnail image URL, sends it to the ws-google-ocr web service and, when the
matching wiki page has no text yet, saves the returned text to that page.
"""

# In[8]:

from base64 import b64encode
from collections import namedtuple
import functools
import hashlib
import json
import re
import time

import pywikibot
import requests


# In[50]:

pybot_username = "Venkateswaran raman"
pybot_docname = "ललितविस्तरः.pdf"
# Page range in "start:end:step" form (end inclusive); see get_range().
pybot_pages = "1:453"


# In[51]:

class OCRException(Exception):
    """Raised when a request to the OCR service fails or returns garbage."""


class retry(object):
    """Decorator that re-runs a function when it raises a given exception.

    Args:
        retries: number of additional attempts after the first call.
        sleep: seconds to wait between attempts.
        exception: exception class (or tuple of classes) that triggers a
            retry; any other exception propagates immediately.
    """

    def __init__(self, retries, sleep, exception):
        self.retries = retries
        self.sleep = sleep
        self.exception = exception

    def __call__(self, f):
        @functools.wraps(f)
        def inner(*args, **kwargs):
            for attempt in range(self.retries + 1):
                try:
                    return f(*args, **kwargs)
                except self.exception:
                    if attempt == self.retries:
                        print("GIVING UP!!... Please retry after sometime ")
                        # Bare raise keeps the original traceback intact.
                        raise
                    print("OCR Extract failed.\n======== \nRETRY {}: \n======== \nseleeping for {} seconds before retry".format(attempt + 1, self.sleep))
                    time.sleep(self.sleep)
        return inner


# ### Function Definitions

# In[52]:

# Distance between an ASCII digit and the matching Devanagari digit
# (U+0966 DEVANAGARI DIGIT ZERO minus U+0030 DIGIT ZERO).
_DEVANAGARI_DIGIT_OFFSET = 2358

# One colon-separated field of a "start:end:step" range string.
_RANGE_FIELD_RE = re.compile(r"([^:\s]*):?")


def to_sans(numb):
    """Return *numb* written with Devanagari digits.

    Every character of the formatted value is shifted by the fixed offset,
    so the input is expected to be a non-negative integer.
    """
    ascii_digits = "{}".format(numb)
    return ''.join(
        chr(ord(ch) + _DEVANAGARI_DIGIT_OFFSET) for ch in ascii_digits
    )


def to_eng(sans_num):
    """Return the integer value of a string of Devanagari digits."""
    ascii_digits = ''.join(
        chr(ord(ch) - _DEVANAGARI_DIGIT_OFFSET) for ch in sans_num
    )
    return int(ascii_digits)


def get_range(p):
    """Parse a "start:end:step" string into arguments for ``range``.

    Missing fields default to ``start=0``, ``end=start`` and ``step=1``.

    Returns:
        A ``(start, end + 1, step)`` tuple of ints; ``end`` is incremented so
        that the range built from the tuple includes the requested last page.
    """
    fields = _RANGE_FIELD_RE.findall(p)
    # findall always reports a trailing empty match at the end of the string.
    fields.pop()
    fields.extend((None, None, None))
    (start, end, step) = fields[:3]

    # default missing values
    if not step:
        step = 1
    if not start:
        start = 0
    if not end:
        end = start
    print("start={}; end={}; step={};".format(start, end, step))
    return (int(start), int(end) + 1, int(step))


# In[53]:

PageInfo = namedtuple('PageInfo', ['url', 'pageno', 'actualpage', 'ocr_url'])


def generate_pages(docname, rng, displayno):
    """Yield a PageInfo for every page of *docname* selected by *rng*.

    Args:
        docname: file name of the document.
        rng: range string understood by get_range().
        displayno: display number given to the first page; it grows by the
            range step for each following page.
    """
    (start, end, step) = get_range(rng)
    for actual in range(start, end, step):
        title = "पृष्ठम्:{}/{}".format(docname, to_sans(actual))
        ocr_url = get_url(docname, actual, "jpg")
        yield PageInfo(title, displayno, actual, ocr_url)
        displayno = displayno + step


## helper functions for generating text from google ocr
def get_url_md5(docname):
    """Return the "/x/xy/" path fragment taken from the MD5 of *docname*."""
    digest = hashlib.md5(docname.encode("utf-8")).hexdigest()
    return "/{}/{}/".format(digest[0], digest[0:2])


def get_url(docname, pageno, image_format):
    """Build the thumbnail image URL for page *pageno* of *docname*."""
    urlprefix = "https://upload.wikimedia.org/wikipedia/commons/thumb/"
    urlsuffix = "page{}-1024px-".format(pageno)
    md5text = get_url_md5(docname)
    # NOTE(review): the result contains "//" right after "thumb" (prefix ends
    # and md5text starts with "/") and a "." between "px-" and the file name;
    # confirm this is the layout the image server / OCR service expects.
    return "{}{}{}/{}.{}.{}".format(
        urlprefix, md5text, docname, urlsuffix, docname, image_format
    )


@retry(retries=5, sleep=25, exception=OCRException)
def url_ocr(image_uri):
    """Request OCR for *image_uri* and return the recognised text.

    Returns:
        The value of the "text" key of the JSON response, or ``None`` when
        the response has no such key.

    Raises:
        OCRException: on any request failure or when the response body is
            not valid JSON. The ``retry`` decorator retries on this error.
    """
    req_url = "https://tools.wmflabs.org/ws-google-ocr/api.php?image={}&lang=sa".format(image_uri)
    print(req_url)

    try:
        resp = requests.post(req_url, timeout=30)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
        raise OCRException(errh) from errh
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        raise OCRException(errc) from errc
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        raise OCRException(errt) from errt
    except requests.exceptions.RequestException as err:
        print("OOps: Something Else", err)
        raise OCRException(err) from err

    if resp is None:
        raise OCRException("Empty resp object returned..")

    try:
        ocrjson = json.loads(resp.text)
    except ValueError as err:
        # A non-JSON body is treated like any other failed request so that
        # the retry decorator gets a chance to try again.
        raise OCRException(err) from err
    return ocrjson.get("text")


# In[54]:

pages = generate_pages(pybot_docname, pybot_pages, 1)


# In[ ]:

#for p in pages:
#    print(p)


# In[ ]:

site = pywikibot.Site("sa", "wikisource")

# NOTE(review): the header format string has no "{}" placeholder, so
# pybot_username is ignored and the header is always empty - confirm intent.
header = "".format(pybot_username)
footer = ""

for p in pages:
    print(p)
    page = pywikibot.Page(site, p.url)
    if page.text == '':
        ocrtext = url_ocr(p.ocr_url)
        if ocrtext is None:
            # Without this guard the literal string "None" would be saved
            # as the page content.
            print("OCR service returned no text. Skipping page")
        else:
            page.text = "{}{}{}".format(header, ocrtext, footer)
            page.status = u"OCR done"
            page.save(summary=u'OCR done.')
    else:
        print("page text already exists. Skipping ocr")
    print("=======================================================================")