import pywikibot
import re
from pywikibot import pagegenerators
# Title of the wiki page that the main loop appends generated descriptions to.
RESULT_PAGE_TITLE = "User:Trialpears/Automatic biography short descriptions"

# Site handle for the 'en' language code of the 'wikipedia' family, and the
# page object (on that site) that collects the results.
site = pywikibot.Site("en", "wikipedia")
resultpage = pywikibot.Page(site, RESULT_PAGE_TITLE)
# One-shot clean-up rules, applied in this order before nested markup is
# stripped.  Each entry is (compiled pattern, replacement).
_EXTRACTFIRST_PRE_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"\[\[[Cc]ategory:[^\]]*\]\]", ""),      # [[Category:...]] links
        (r"\[\[[Ff]ile:[^\]]*\]\]", ""),          # [[File:...]] embeds
        (r"\[\[[Ii]mage:[^\]]*\]\]", ""),         # [[Image:...]] embeds
        (r"\n", " "),                             # flatten to a single line
        (r"==.*", ""),                            # drop everything from the first "=="
        (r"^\s*'''[^']+'''", ""),                 # bold title at the very start
        (r"''+", ""),                             # remaining bold/italic quote runs
        # [[target|label]] -> label, [[target]] -> target
        (r"\[\[([^\|\]\[]*\|)?([^\|\]\[]*)\]\]", r"\2"),
    )
)

# Innermost-only patterns; repeating them peels nested markup one level per
# pass.  The order inside a pass is significant and mirrors the original.
_EXTRACTFIRST_NESTED_RULES = tuple(
    (re.compile(pattern), "")
    for pattern in (
        r"\{\{[^{}]*\}\}",                # {{templates}}
        r"\([^()]*\)",                    # (parenthesised asides)
        r"<ref[^<>]*>[^<>]*</ref>",       # <ref ...>...</ref>
        r"<ref[^<>]*/>",                  # <ref ... />
        r"<!--[^<>]*-->",                 # <!-- comments -->
    )
)

# Final tidy-up rules, applied once in this order.
_EXTRACTFIRST_POST_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r" +", " "),                                 # collapse runs of spaces
        # keep only up to the first . ! or ? that is followed by
        # "<whitespace><capital letter>" or by the end of the text
        (r"(^.*?[.!?](?=\s[A-Z]|$)).*", r"\1"),
        (r"^\s*", ""),                                # leading whitespace
        (r"\s*(?=,|\.)", ""),                         # whitespace before , or .
        (r"\s*$", ""),                                # trailing whitespace
    )
)

# Maximum number of nested-markup passes (i.e. nesting depth handled).
_EXTRACTFIRST_MAX_PASSES = 5


def _extractfirst_apply(text, rules):
    """Apply each (pattern, replacement) rule to *text* in order."""
    for pattern, replacement in rules:
        text = pattern.sub(replacement, text)
    return text


def extractfirst(text):
    """Return the first sentence of *text* with wiki markup stripped.

    The text is flattened to one line, category/file/image links, the
    leading bold title, quote markup, wikilink brackets, templates,
    parenthesised asides, ``<ref>`` tags and HTML comments are removed, and
    the result is cut after the first sentence terminator.

    Args:
        text: Wikitext of a page, as a string.

    Returns:
        The cleaned first sentence as a string (possibly empty).
    """
    result = _extractfirst_apply(text, _EXTRACTFIRST_PRE_RULES)

    # Up to five passes, as before.  A pass that changes nothing means every
    # later pass would be a no-op too, so stopping early cannot alter the
    # result.
    for _ in range(_EXTRACTFIRST_MAX_PASSES):
        stripped = _extractfirst_apply(result, _EXTRACTFIRST_NESTED_RULES)
        if stripped == result:
            break
        result = stripped

    # No newline removal is needed here: newlines were already replaced by
    # spaces above and no later rule can introduce one.
    return _extractfirst_apply(result, _EXTRACTFIRST_POST_RULES)
# Optional leading name words ('Capitalised', optionally quoted, or "Jr."),
# then "is"/"was" + "a"/"an"; group 6 captures the rest of the line.
_DESCRIPTION_LEAD = re.compile(
    r'^(("?[A-Z][a-z]*"?|(Jr\.)),? )*?(is|was) (a|an) (.*)'
)

# Trailing clauses that are cut off (each from its first occurrence to the
# end of the line), followed by removal of a final period.  Order matters.
_DESCRIPTION_TRIMS = tuple(
    re.compile(pattern)
    for pattern in (
        r",? who .*",
        r",? known for .*",
        r",? currently .*",
        r",? as well .*",
        r",? better known .*",
        r"\.$",
    )
)

_DESCRIPTION_FIRST_ASCII_LETTER = re.compile(r"[a-zA-Z]")

# Accepted length range (inclusive) for a description.
_DESCRIPTION_MIN_LEN = 5
_DESCRIPTION_MAX_LEN = 40

# Prefixes a description must start with to be accepted.  This is the word
# list of the original alternation with exact duplicates removed; it holds
# demonyms plus fragments of multi-word country names ("New", "States", ...).
# In SOURCE the literal was split in the middle of "Saudi", which is not a
# valid string literal; the word is reassembled here.
# NOTE(review): matching is by prefix, exactly as the original unanchored
# alternation did, so e.g. "Statesman" is accepted via "States".  Confirm
# whether a whole-word match is wanted before tightening this.
_DESCRIPTION_PREFIXES = tuple("""
    Afghan Albanian Algerian Andorran Angolan Barbuda Antiguan Barbudan
    Argentine Armenian Australian Austrian Azerbaijani Azeri Bahamas Bahamian
    Bahraini Bengali Barbadian Belarusian Belgian Belizean Beninese Beninois
    Bhutanese Bolivian Bosnian Herzegovinian Motswana Botswanan Brazilian
    Bruneian Bulgarian Faso Burkinabé Burmese Burundian Verde Cabo Verdean
    Cambodian Cameroonian Canadian African Chadian Chilean Chinese Colombian
    Comoran Comorian Congolese Rican Ivorian Croatian Cuban Cypriot Republic
    Czech Danish Djiboutian Dominican Timor Timorese Ecuadorian Egyptian
    Salvador Salvadoran Guinea Equatorial Guinean Equatoguinean Eritrean
    Estonian Ethiopian Fijian Finnish French Gabonese Gambian Georgian German
    Ghanaian Gibraltar Greek Hellenic Grenadian Guatemalan Bissau Guyanese
    Haitian Honduran Hungarian Magyar Icelandic Indian Indonesian Iranian
    Persian Iraqi Irish Israeli Italian Coast Jamaican Japanese Jordanian
    Kazakhstani Kazakh Kenyan Kiribati Korea North Korean South Kuwaiti
    Kyrgyzstani Kyrgyz Kirgiz Kirghiz Lao Laotian Latvian Lettish Lebanese
    Basotho Liberian Libyan Liechtensteiner Lithuanian Luxembourg
    Luxembourgish Macedonian Malagasy Malawian Malaysian Maldivian Malian
    Malinese Maltese Islands Marshallese Martiniquais Martinican Mauritanian
    Mauritian Mexican Federated States of Micronesian Moldovan Monégasque
    Monacan Mongolian Montenegrin Moroccan Mozambican Namibian Nauruan Nepali
    Nepalese Dutch Netherlandic Zealand New NZ Zelanian Nicaraguan Nigerien
    Nigerian Mariana Northern Marianan Norwegian Omani Pakistani Palauan
    Palestinian Panamanian Papua Papuan Paraguayan Peruvian Filipino
    Philippine Polish Portuguese Rico Puerto Qatari Romanian Russian Rwandan
    Kitts and Nevis Kittitian or Nevisian Lucia Saint Lucian Vincent the
    Grenadines Vincentian Samoan Marino Sammarinese ão Tomé Príncipe São
    Toméan Arabia Saudi Arabian Senegalese Serbian Seychellois Leone Sierra
    Leonean Singapore Singaporean Slovak Slovenian Slovene Solomon Island
    Somali Africa Sudan Sudanese Spanish Lanka Sri Lankan Surinamese Swazi
    Swedish Swiss Syrian Tajikistani Tanzanian Thai Leste Togolese Tokelauan
    Tongan Tobago Trinidadian Tobagonian Tunisian Turkish Turkmen Tuvaluan
    Ugandan Ukrainian Arab Emirates Emirati Emirian Emiri Kingdom Great
    Britain Ireland UK British America United U.S. American Uruguayan
    Uzbekistani Uzbek Vanuatu Vanuatuan City State Vatican Venezuelan
    Vietnamese Yemeni Zambian Zimbabwean
""".split())


def extractdescription(text):
    """Derive a short description from a sentence of the form "X is a ...".

    The sentence must start with optional capitalised name words followed by
    "is"/"was" and "a"/"an".  The remainder is cut at the first " who ",
    " known for ", " currently ", " as well " or " better known " clause, a
    trailing period is removed and the first ASCII letter is upper-cased.
    The candidate is accepted only if it is 5 to 40 characters long and
    starts with one of the nationality prefixes.

    Compared with the previous regex-based prefix test, "U.S." is now matched
    literally instead of with its dots acting as wildcards.

    Args:
        text: A single sentence, e.g. the output of ``extractfirst``.

    Returns:
        The description string, or ``False`` when no acceptable description
        can be derived (callers test the result with ``is not False``).
    """
    lead = _DESCRIPTION_LEAD.match(text)
    if lead is None:
        return False

    # Equivalent to substituting the whole match with group 6: anything the
    # pattern did not consume (text after a newline) is kept unchanged.
    result = lead.group(6) + text[lead.end():]

    for trim in _DESCRIPTION_TRIMS:
        result = trim.sub("", result)

    result = _DESCRIPTION_FIRST_ASCII_LETTER.sub(
        lambda found: found.group(0).upper(), result, count=1
    )

    if not _DESCRIPTION_MIN_LEN <= len(result) <= _DESCRIPTION_MAX_LEN:
        return False
    if not result.startswith(_DESCRIPTION_PREFIXES):
        return False
    return result
# Walk the search results for pages in the "Living people" category, derive a
# short description for each page and append the usable ones to the result
# page.  The result page is saved after every 1000 pages processed (the old
# counter started at 1 and was incremented before the check, so it fired
# after 999) and once more at the end if unsaved entries remain.
SAVE_INTERVAL = 1000
SAVE_SUMMARY = "Testing bot things"

counter = 0   # number of pages processed so far
unsaved = 0   # entries appended since the last save
for counter, page in enumerate(
        pagegenerators.SearchPageGenerator('incategory:"Living people"'),
        start=1):
    # Compute the description once; extractdescription returns False when
    # the first sentence does not yield an acceptable description.
    description = extractdescription(extractfirst(page.text))
    if description is not False:
        print("\n*" + str(page) + ":" + description)
        resultpage.text += "\n*" + str(page) + ": " + description
        unsaved += 1
    # Value comparison: "is 0" tested object identity, not equality.
    if counter % SAVE_INTERVAL == 0:
        print("save")
        resultpage.save(SAVE_SUMMARY)
        unsaved = 0

# Entries collected after the last periodic save would otherwise be lost
# when the generator is exhausted.
if unsaved:
    print("save")
    resultpage.save(SAVE_SUMMARY)
*[[en:Adam Fairclough]]:British historian of the United States

*[[en:Adrian Blincoe]]:New Zealand middle distance runner

*[[en:Al Madril]]:American retired professional wrestler

*[[en:Alassane Ouattara]]:Ivorian politician

*[[en:Alexander Kudryavtsev]]:Russian professional tennis player

*[[en:Amber Hearn]]:New Zealand association footballer

*[[en:Andrew Nicholson (equestrian)]]:New Zealand horseman

*[[en:Arved Fuchs]]:German explorer
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/srv/paws/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    378                 # Python 2.7, use buffering of HTTP responses
--> 379                 httplib_response = conn.getresponse(buffering=True)
    380             except TypeError:

TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-1-39b8caac7c85> in <module>
     47 for page in pywikibot.pagegenerators.SearchPageGenerator('incategory:"Living people"'):
     48     counter += 1
---> 49     if extractdescription(extractfirst(page.text)) is not False:
     50         text=extractfirst(page.text)
     51         print("\n*"+str(page)+":" + extractdescription(extractfirst(page.text)))

/srv/paws/pwb/pywikibot/page.py in text(self)
    623         if not hasattr(self, '_text') or self._text is None:
    624             try:
--> 625                 self._text = self.get(get_redirect=True)
    626             except pywikibot.NoPage:
    627                 # TODO: what other exceptions might be returned?

/srv/paws/pwb/pywikibot/tools/__init__.py in wrapper(*__args, **__kw)
   1736                              cls, depth)
   1737                     del __kw[old_arg]
-> 1738             return obj(*__args, **__kw)
   1739 
   1740         if not __debug__:

/srv/paws/pwb/pywikibot/page.py in get(self, force, get_redirect, sysop)
    478             del self.latest_revision_id
    479         try:
--> 480             self._getInternals(sysop)
    481         except pywikibot.IsRedirectPage:
    482             if not get_redirect:

/srv/paws/pwb/pywikibot/page.py in _getInternals(self, sysop)
    508         if self._latest_cached_revision() is None:
    509             try:
--> 510                 self.site.loadrevisions(self, content=True, sysop=sysop)
    511             except (pywikibot.NoPage, pywikibot.SectionError) as e:
    512                 self._getexception = e

/srv/paws/pwb/pywikibot/tools/__init__.py in wrapper(*__args, **__kw)
   1736                              cls, depth)
   1737                     del __kw[old_arg]
-> 1738             return obj(*__args, **__kw)
   1739 
   1740         if not __debug__:

/srv/paws/pwb/pywikibot/site.py in loadrevisions(self, page, content, revids, startid, endid, starttime, endtime, rvdir, user, excludeuser, section, sysop, step, total, rollback)
   4146             rvgen.set_maximum_items(-1)  # suppress use of rvlimit parameter
   4147 
-> 4148         for pagedata in rvgen:
   4149             if not self.sametitle(pagedata['title'],
   4150                                   page.title(with_section=False)):

/srv/paws/pwb/pywikibot/data/api.py in __iter__(self)
   2981         """Yield results."""
   2982         self._previous_dicts = {}
-> 2983         for result in super(PropertyGenerator, self).__iter__():
   2984             yield result
   2985         for result in self._previous_dicts.values():

/srv/paws/pwb/pywikibot/data/api.py in __iter__(self)
   2805                 prev_limit, new_limit, previous_result_had_data)
   2806             if not hasattr(self, 'data'):
-> 2807                 self.data = self.request.submit()
   2808             if not self.data or not isinstance(self.data, dict):
   2809                 pywikibot.debug(

/srv/paws/pwb/pywikibot/data/api.py in submit(self)
   1983                                                                    paramstring)
   1984             rawdata, use_get = self._http_request(use_get, uri, body, headers,
-> 1985                                                   paramstring)
   1986             if rawdata is None:
   1987                 continue

/srv/paws/pwb/pywikibot/data/api.py in _http_request(self, use_get, uri, body, headers, paramstring)
   1742                 site=self.site, uri=uri,
   1743                 method='GET' if use_get else 'POST',
-> 1744                 body=body, headers=headers)
   1745         except Server504Error:
   1746             pywikibot.log('Caught HTTP 504 error; retrying')

/srv/paws/pwb/pywikibot/tools/__init__.py in wrapper(*__args, **__kw)
   1736                              cls, depth)
   1737                     del __kw[old_arg]
-> 1738             return obj(*__args, **__kw)
   1739 
   1740         if not __debug__:

/srv/paws/pwb/pywikibot/comms/http.py in request(site, uri, method, params, body, headers, data, **kwargs)
    321 
    322     baseuri = site.base_url(uri)
--> 323     r = fetch(baseuri, method, params, body, headers, **kwargs)
    324     site.throttle.retry_after = int(r.response_headers.get('retry-after', 0))
    325     return r.text

/srv/paws/pwb/pywikibot/comms/http.py in fetch(uri, method, params, body, headers, default_error_handling, use_fake_user_agent, data, **kwargs)
    519             headers['user-agent'] = fake_user_agent()
    520 
--> 521     request = _enqueue(uri, method, params, body, headers, **kwargs)
    522     # if there's no data in the answer we're in trouble
    523     assert request._data is not None

/srv/paws/pwb/pywikibot/comms/http.py in _enqueue(uri, method, params, body, headers, data, **kwargs)
    475     request = threadedhttp.HttpRequest(
    476         uri, method, params, body, all_headers, callbacks, **kwargs)
--> 477     _http_process(session, request)
    478     return request
    479 

/srv/paws/pwb/pywikibot/comms/http.py in _http_process(session, http_request)
    388                                    headers=headers, auth=auth, timeout=timeout,
    389                                    verify=not ignore_validation,
--> 390                                    **http_request.kwargs)
    391     except Exception as e:
    392         http_request.data = e

/srv/paws/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    531         }
    532         send_kwargs.update(settings)
--> 533         resp = self.send(prep, **send_kwargs)
    534 
    535         return resp

/srv/paws/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
    644 
    645         # Send the request
--> 646         r = adapter.send(request, **kwargs)
    647 
    648         # Total elapsed time of the request (approximately)

/srv/paws/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    447                     decode_content=False,
    448                     retries=self.max_retries,
--> 449                     timeout=timeout
    450                 )
    451 

/srv/paws/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    601                                                   timeout=timeout_obj,
    602                                                   body=body, headers=headers,
--> 603                                                   chunked=chunked)
    604 
    605             # If we're going to release the connection in ``finally:``, then

/srv/paws/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    381                 # Python 3
    382                 try:
--> 383                     httplib_response = conn.getresponse()
    384                 except Exception as e:
    385                     # Remove the TypeError from the exception chain in Python 3;

/usr/lib/python3.6/http/client.py in getresponse(self)
   1329         try:
   1330             try:
-> 1331                 response.begin()
   1332             except ConnectionError:
   1333                 self.close()

/usr/lib/python3.6/http/client.py in begin(self)
    295         # read until we get a non-100 response
    296         while True:
--> 297             version, status, reason = self._read_status()
    298             if status != CONTINUE:
    299                 break

/usr/lib/python3.6/http/client.py in _read_status(self)
    256 
    257     def _read_status(self):
--> 258         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    259         if len(line) > _MAXLINE:
    260             raise LineTooLong("status line")

/usr/lib/python3.6/socket.py in readinto(self, b)
    584         while True:
    585             try:
--> 586                 return self._sock.recv_into(b)
    587             except timeout:
    588                 self._timeout_occurred = True

/usr/lib/python3.6/ssl.py in recv_into(self, buffer, nbytes, flags)
   1010                   "non-zero flags not allowed in calls to recv_into() on %s" %
   1011                   self.__class__)
-> 1012             return self.read(nbytes, buffer)
   1013         else:
   1014             return socket.recv_into(self, buffer, nbytes, flags)

/usr/lib/python3.6/ssl.py in read(self, len, buffer)
    872             raise ValueError("Read on closed or unwrapped SSL socket.")
    873         try:
--> 874             return self._sslobj.read(len, buffer)
    875         except SSLError as x:
    876             if x.args[0] == SSL_ERROR_EOF and self.suppress_ragged_eofs:

/usr/lib/python3.6/ssl.py in read(self, len, buffer)
    629         """
    630         if buffer is not None:
--> 631             v = self._sslobj.read(len, buffer)
    632         else:
    633             v = self._sslobj.read(len)

KeyboardInterrupt: