import pywikibot 
import re, datetime
from pywikibot import pagegenerators
# Navajo verb root being processed (suffix of the user index page).
ROOT="-TʼEʼ"

site = pywikibot.Site('en','wiktionary')
page = pywikibot.Page(site,"User:Julien_Daux/"+ROOT)

# Patterns for parsing the root index page. Raw strings avoid the
# invalid escape sequences ("\(" etc.) of the original literals, which
# warn today and are errors in future Python versions.
m_etym=re.compile(r"=== *Etymology")          # etymology section header
m_glosses=re.compile(r"^ *# *'''(.*)'''")     # bolded gloss on a "#" list line
m_themes=re.compile(r"^ *## *\(([^\)]*)\)")   # parenthesized theme on a "##" line

glosses=[]   # glosses in page order
themes={}    # gloss -> list of theme strings
gloss=""     # current gloss while scanning
text= page.text.split("\n")
# Walk the index page line by line, collecting glosses and the themes
# listed under each of them.
for line in text:
  if m_etym.search(line):
    print("new etym")
    continue

  hit = m_glosses.search(line)
  if hit:
    gloss = hit.group(1)
    glosses.append(gloss)
    themes[gloss] = []
    print("  " + gloss)
    continue

  hit = m_themes.search(line)
  if hit:
    theme = hit.group(1)
    themes[gloss].append(theme)
    print("    " + theme)



print(datetime.datetime.now().strftime("%X")) 

# Templates scraped from each verb entry. Raw strings fix the invalid
# escape sequences ("\|", "\}") of the original non-raw literals.
m=re.compile( r"nv-prefixes *\|([^\}]+)\}\}")        # prefix list
m2=re.compile(r"nv-paradigm *\|([^\}]+)\}\}")        # paradigm / mode-aspect
m3=re.compile(r"nv-link-to-root *\|([^\}]+)\}\}")    # root + gloss link
#m4=re.compile(r"\* \{\{l\|nv\|([^\}]+)\}\}")
#m5=re.compile(r"\* \[\[([^\]]+)\]\]")
m6=re.compile(r"nv-verbtable(-ext)* *\|[^|]+\|([^|]+)\|([^|]+)\|")  # conjugation table
m7=re.compile(r"\n#(.*)")                            # first "#" definition line

def rework(s) :
  """Normalize a raw wikitext definition line: abbreviate common
  phrases, drop qualifiers/markup and reduce parenthesized examples.

  Returns the shortened, stripped definition string."""
  ret=s
  # common abbreviations to keep the definition column short
  ret=ret.replace("something unspecified", "sth") 
  ret=ret.replace("something","sth")

  ret=ret.replace("he/she/it","he")
  ret=ret.replace("he/she","he")
  ret=ret.replace("him/her/it","him")
  ret=ret.replace("him/her","him")
  ret=ret.replace("his/her/its","his") 
  ret=ret.replace("his/her","his")
  ret=ret.replace("such as","as")
  ret=ret.replace("object","obj")
  ret=ret.replace("(2 actors)","(2)")
  ret=ret.replace("(3+ actors)","(3+)")

  # remove italics between parentheses, e.g. ("''...''")
  ret=re.sub(r"\(\'\'.*\'\)", '', ret)

  # remove {{qualifier|...}} templates.  The original pattern used the
  # bad escape "\q" (a re.error on modern Python; it matched a literal
  # "q" on older versions, which is what was intended).
  ret=re.sub(r"\{\{qualifier\|[^\}]*\}\}", '', ret)

  # remove a leading parenthesized clause
  ret=re.sub(r"^\([^\)]*\)", '', ret.strip())

  # reduce examples: keep only the first alternative / first list item
  ret=re.sub(r"\(([^\)]*) or [^\)]*\)", r'(\1)', ret.strip())
  ret=re.sub(r"\(([^,\)]*),[^\)]*\)", r'(\1)', ret.strip())

  # strip [[target|text]] / [[text]] link markup down to the display text
  ret=re.sub(r"\[\[([^\|\]]*\|)?([^\]]*)\]\]", r'\2', ret.strip())

  return ret.strip()

def cut50(s):
  """Shorten a definition to about 50 characters, keeping any trailing
  parenthesized clause and preferring to cut at a comma."""
  if len(s) <= 50:
    return s

  # detach a trailing "(...)" clause, if present
  paren = ""
  body = s
  hit = re.search(r"(\([^\)]*\))$", s)
  if hit:
    paren = " " + hit.group(1)
    body = s[:hit.start()]

  # a long parenthetical leaves less room for the body
  limit = 25 if len(paren) > 25 else 50
  head = body[:limit]
  comma = head.rfind(",")
  if comma != -1:
    return head[:comma].strip() + paren
  return body.split(',')[0].strip() + paren


    
class Verb:
  """One Navajo verb entry: its prefixes grouped by position slot,
  paradigm, root/gloss and the fields shown in the output table."""

  def __init__(self, verb):
    self.verb = verb          # page title (the verb form)
    self.prefixes = []        # raw prefix tags from {{nv-prefixes}}
    # one list per position slot 0-10; filled by the classification pass
    self.slots    = [[],[],[],[],[],[],[],[],[],[],[]]
    self.paradigm = ["", "" ] # [mode/aspect, extra] from {{nv-paradigm}}
    self.count =0
    self.root = ""
    self.gloss =""
    self.theme=""             # matching theme, set by themedef()
    self.definition =""
    self.firstperson=""
    self.seealso = []

  def themedef(self, themes) :
    """Record the last theme whose classifier (slot 9) and remaining
    prefixes (slots 4/6) all occur in this verb's slots.

    themes -- list of "prefix-...-classifier" strings; the final
    "-" component may list alternative classifiers separated by "/".
    """
    for theme in themes:
       prefixes=theme.split("-")
       classifiers=prefixes[-1].split("/")
       # Guard: a verb with no classifier collected cannot match any
       # theme (the original indexed slots[9][0] unconditionally and
       # raised IndexError here).
       if not self.slots[9]:
          continue
       ok = self.slots[9][0] in classifiers
       ok = ok and all(p in self.slots[4] or p in self.slots[6]
                       for p in prefixes[:-1])
       if ok:
          self.theme = theme
# Accumulators for the crawl below.
verbList=[]       # Verb objects that had an {{nv-prefixes}} template
    
unclassified=[]   # page titles lacking {{nv-prefixes}}
seealso={} 
prefixes=set()    # every distinct prefix tag seen across all verbs
count=0

# Crawl every verb page in each per-gloss category and scrape its
# templates (prefixes, paradigm, root link, verb table, definition).
for gloss in glosses:
 cat = pywikibot.Category(site,'Category:Navajo terms belonging to the root '+ROOT+ ' ('+gloss+')')

 gen = pagegenerators.CategorizedPageGenerator(cat) 
 for page in gen:
  print(ROOT, gloss, page.title()) 
  text = page.text
  verb = page.title()
  v=Verb(verb)

    
  # {{nv-prefixes|...}} -- assumed to sit within the first 200 chars
  p=m.search(text[0:200])
  if p :
    print(datetime.datetime.now().strftime("%X"),count, page.title())
    g=p.group(1) 
    s=g.split('|')

    verbList.append(v)
    v.count=count
    v.prefixes=s
    
    prefixes|=set(s)
  else:
    # no prefix template: the verb cannot be slotted into the table
    unclassified.append(verb)

  # {{nv-paradigm|...}}
  p2=m2.search(text[100:])
  if p2 :
    g=p2.group(1)
    s=g.split('|')


    v.paradigm=s
    
  # {{nv-link-to-root|root|gloss}}
  p3=m3.search(text[0:300])
  if p3 :
    g=p3.group(1)
    s=g.split('|')
    v.root= s[0].strip()
    v.gloss = s[1].strip() 

    


  # {{nv-verbtable...}}: take the first-person form (second column),
  # falling back to the third when the cell is empty or a &nbsp;
  p6=m6.search(text[200:])
  if p6 :
      #print(p6)  
      g=p6.group(2).strip() 
      if len(g)<3 or g=="&nbsp;":
        g=p6.group(3).strip()

      v.firstperson= g
    
  # first "#" definition line, shortened for the table column
  for p7 in re.finditer(m7, text):
    g=p7.group(1);
    v.definition=cut50(rework(g))
    break
               
### prefixes classification
# Map every prefix tag seen during the crawl to its position slot (0-9).
prefixslot={}
for p in prefixes:
  pp = p.split('-')
  p1 = pp[0]                              # first "-" component (always present)
  p2 = pp[1] if len(pp) >= 2 else " "     # second component, if any
  if p=="-" or p1 in ('l', 'ł', 'd', 'L'):
    slot = 9   # classifier
  elif p in ('a-away', 'da-death', 'di-disj') :
    slot = 1
  elif p in  ('ni', 'si', 'yi-semel', 'yi-trans', 'yi') :
    slot = 7   # mode/aspect
  elif p2 == '3s':
    # NOTE: the original used `p2 in ('3s')` -- a substring test on a
    # plain string that also matched '3' and 's' alone.
    slot = 7
  elif p1 in ('di', 'hi','ni', 'ní', 'yi', 'yíní', 'dini','yini', 'sh', 'dzi'  ) :
    slot = 6
  elif p in ('a', 'ho', '3o', 'ah', '4i') :
    slot = 4   # object pronoun
  elif p == '3s':
    slot = 8
  elif p in ('b', 'ał', 'ah-disj') :
    slot = 0
  else:
    slot = 1   # default: disjunct prefix
  prefixslot[p] =slot 


# Distribute each verb's prefixes into their display slots, applying
# the per-prefix display formatting, then resolve its theme.
for verb in verbList:    
    for j in range(len(verb.prefixes)):
       vp = verb.prefixes[j]
       slot = prefixslot[vp]
        
       # A POS IV object prefix in first position followed by a POS I
       # prefix is actually a POS 0 (postpositional object) prefix.
       # (Guard added: the original indexed verb.prefixes[1] without
       # checking that a second prefix exists.)
       if (slot == 4 and j == 0 and len(verb.prefixes) > 1
               and prefixslot[verb.prefixes[1]] == 1):
            slot = 0
            
       # formatting of the displayed form.  The original compared with
       # `vp in ('3o')` etc. -- substring tests on plain strings; these
       # are now exact equality checks.
       vpp=vp.split('-')[0]     
       if vp == '-':
          vpp='Ø'          # null prefix
       elif vp == '3o':
          vpp='y'          # 3rd person object
       elif vp == '4i':
          vpp='ʼa'         # 4th person indefinite object
       elif vp =='L' :
          vpp='ł'
       elif vp[0] in ('a','á') and vp!='aʼ':
          vpp='ʼ'+vpp      # restore the glottal onset
       elif slot ==7 and len(vp.split('-')) ==3:
          # mode prefix with an explicit conjugated form, shown in parens
          form=vp.split('-')[2]
          if form in ("í","íí"):
               form=form.replace("í","◌́")
          vpp= vpp+" (" +form+")"
        
       verb.slots[slot].append(vpp)
    
    verb.themedef(themes[verb.gloss]) 
       
# Abbreviate the paradigm (mode/aspect) name for the table column;
# unrecognized names are kept unchanged.
PARADIGM_ABBREV = {
    "momentaneous": "MOM",
    "neuter imperfective": "N-IMP",
    "neuter perfective": "N-PERF",
    "neuter repetitive": "N-REP",
    "neuter absolute": "N-ABS",
    "repetitive": "REP",
    "conclusive": "CONCL",
    "continuative": "CONT",
    "conative": "CONAT",
    "semelfactive": "SEM",
    "transitional": "TRANS",
    "durative": "DUR",
}
for verb in verbList:
    if not verb.paradigm:
        continue
    para = verb.paradigm[0]
    key = para.lower().strip()
    verb.paradigm[0] = PARADIGM_ABBREV.get(key, para)
    
    
###### print table

# Build the wikitable: a header template plus one row template
# invocation per verb, then publish it.
s="{{User:Julien_Daux/Template:nv-root-theme-header}}\n" 

for verb in verbList:
 s+="{{User:Julien_Daux/Template:nv-root-theme-row|"

 s+=verb.gloss+"|1|"+verb.theme+"|1|"+verb.verb+"|"+verb.firstperson

 # one column per displayed prefix slot (slots 2, 3, 5 and 8 are skipped)
 for i in range(0,10):
   if i in (2,3,5, 8):
    continue
   
   s +="|"
   j=0
   # multiple prefixes in the same slot are joined with "-"
   for vp in verb.slots[i]:
      s+= (j>0 and "-" or "") + vp
      j+=1
 s+="|"+verb.paradigm[0] +"|"+verb.paradigm[1]+"|"+verb.definition+"}} \n"

s+="|}"

# Save the generated table to the target user page.
page=pywikibot.Page(site,"User:Julien_Daux/Navajo_verbs_for_roots") 
page.text=s
page.save()
new etym
  be
    Ø
    l
    ł
new etym
  propel SSO/AnO
    ł
  hop
    hi-d
04:46:43
-TʼEʼ be aheełtʼé
04:46:43 0 aheełtʼé
-TʼEʼ be yeełtʼé
04:46:43 0 yeełtʼé
-TʼEʼ be ákótʼé
04:46:43 0 ákótʼé
-TʼEʼ be ánéeltʼeʼ
04:46:43 0 ánéeltʼeʼ
-TʼEʼ be átʼé
04:46:43 0 átʼé
-TʼEʼ propel SSO/AnO haiłtʼeʼ
04:46:43 0 haiłtʼeʼ
-TʼEʼ propel SSO/AnO iiłtʼeʼ
04:46:43 0 iiłtʼeʼ
-TʼEʼ propel SSO/AnO yiłtʼeʼ
04:46:43 0 yiłtʼeʼ
Sleeping for 8.8 seconds, 2016-11-08 04:46:44
Page [[wiktionary:en:User:Julien Daux/Navajo verbs for roots]] saved