Extracting pages containing keywords from a dump

This notebook scans the pages of a given dump and extracts those that contain at least one keyword from a given set.

import mwxml
import re
import copy
import csv

Define paths to visit

import glob

# Un-numbered archive that also matches the wildcard below; it is excluded from
# the run (presumably the combined dump whose pages are already covered by the
# numbered parts -- confirm).
_FULL_DUMP = '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current.xml.bz2'

# Filter instead of list.remove() so that a missing archive (for example a
# different dump layout) does not abort the cell with ValueError.
paths = [
    p
    for p in glob.glob('/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current*.bz2')
    if p != _FULL_DUMP
]
paths
['/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current17.xml-p13039268p13693071.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current6.xml-p565314p892912.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current21.xml-p21222158p22722158.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current19.xml-p17620543p18754735.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current8.xml-p1268692p1791079.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current26.xml-p38067203p39567203.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p42663462p44163462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p57663462p59163462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current23.xml-p26823661p28323661.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current20.xml-p20254736p21222156.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current2.xml-p30304p88444.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p54663462p56163462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current25.xml-p33952816p35452816.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current24.xml-p30503451p32003451.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p45663462p47163462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current17.xml-p11539268p13039268.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current15.xml-p7744803p9244803.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p59163462p60663462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current25.xml-p36952816p38067202.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p47163462p48663462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current4.xml-p200511p352689.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current25.xml-p35452816p36952816.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current19.xml-p16120543p17620543.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current10.xml-p2336425p3046511.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current26.xml-p41067203p42567203.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current18.xml-p15193074p16120542.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current11.xml-p3046514p3926861.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p44163462p45663462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p51663462p53163462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p56163462p57663462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p53163462p54663462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current13.xml-p5040438p6197594.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current15.xml-p9244803p9518048.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p50163462p51663462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current14.xml-p6197598p7697598.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current24.xml-p32003451p33503451.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current24.xml-p33503451p33952815.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current16.xml-p9518050p11018050.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current22.xml-p23927984p25427984.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current7.xml-p892914p1268691.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current23.xml-p29823661p30503449.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current14.xml-p7697598p7744799.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current18.xml-p13693074p15193074.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current16.xml-p11018050p11539266.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current1.xml-p10p30303.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current12.xml-p3926863p5040436.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p48663462p50163462.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current22.xml-p25427984p26823660.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current23.xml-p28323661p29823661.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current26.xml-p42567203p42663461.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current20.xml-p18754736p20254736.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current9.xml-p1791080p2336422.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current27.xml-p60663462p60822713.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current5.xml-p352690p565313.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current26.xml-p39567203p41067203.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current21.xml-p22722158p23927983.bz2',
 '/public/dumps/public/enwiki/20190520/enwiki-20190520-pages-meta-current3.xml-p88445p200507.bz2']
# Explicit list of dump parts to process; this assignment replaces any value
# previously stored in `paths`.
paths = [
    '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-current1.xml-p10p30303.bz2',
    '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-current2.xml-p30304p88444.bz2',
]

# Search terms are kept as one comma-separated string and split into a list.
keywords_raw1 = "flood,floods,flooding,flooded,inundation"
keywords = keywords_raw1.split(sep=',')
print(keywords)
['flood', 'floods', 'flooding', 'flooded', 'inundation']

Find keywords function

Returns a boolean indicating whether any of the keywords was found in the given text.

def find_keywords(text, terms=None):
    """Report whether ``text`` contains at least one keyword.

    Matching is a plain, case-sensitive substring test, so ``'flood'`` also
    matches inside ``'floods'`` but does not match ``'Flood'``.

    Args:
        text: String to search.
        terms: Optional iterable of keyword strings. When omitted, the
            module-level ``keywords`` list is used.

    Returns:
        bool: True if any keyword occurs in ``text``, otherwise False.
    """
    if terms is None:
        # Default to the notebook-wide keyword list, as the original did.
        terms = keywords
    return any(term in text for term in terms)

XML Processor on path

# Removes ``{...}}`` spans (template markup such as ``{{sfn|...}}``) before the
# text is split; nested templates are not handled.
_TEMPLATE_RE = re.compile(r'\{[^}]+\}\}')
# Split point: a space, '[', '<' or newline that directly follows '.', '?' or
# a newline. The '[' is escaped to avoid the "nested set" FutureWarning.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.?\n])[ \[<\n]')


def process_dump(dump, path):
    """Yield the keyword-bearing sentences of every main-namespace page.

    Designed to be passed to ``mwxml.map``, which calls it as
    ``process(dump, path)``.

    Args:
        dump: Iterable of page objects; each page exposes ``namespace`` and
            ``title`` and iterates over revisions that expose ``text``.
        path: Path of the dump file being processed (unused, but part of the
            callback signature).

    Yields:
        list[dict]: For each revision with at least one match, a list of
        ``{'page': <title>, 'sentence': <sentence>}`` dicts.

    Note:
        Exceptions raised by the dump iterator itself are not caught here.
        NOTE(review): a recorded run of this notebook failed with a TypeError
        raised inside mwxml while it was loading a revision -- confirm whether
        such pages should be skipped.
    """
    for page in dump:
        # Only pages in namespace 0 are scanned.
        if page.namespace != 0:
            continue

        for revision in page:
            # A revision may carry no text; treat it as empty instead of
            # letting re.sub() fail on None.
            text = _TEMPLATE_RE.sub('', revision.text or '')

            # Collected per revision so that a page with several revisions
            # does not re-emit the rows of its earlier revisions.
            matches = []
            for chunk in _SENTENCE_BOUNDARY_RE.split(text):
                for sentence in chunk.splitlines():
                    if sentence and find_keywords(sentence):
                        matches.append({'page': page.title, 'sentence': sentence})

            if matches:
                yield matches

OK. Now that everything is defined, it's time to run the code. mwxml has a map() function that applies the process_dump function to each of the XML dump files in paths -- in parallel -- using Python's multiprocessing library and collects all of the yielded values in a generator. As the code below demonstrates, it's easy to collect this output and write it to a new output file or print it out to the console (not recommended for large amounts of output).

# Number of CSV rows written (one row per matching sentence).
count = 0
# Column order of the output file; matches the keys of the dicts produced by
# process_dump.
dict_keys = ['page', 'sentence']

# The encoding is fixed to UTF-8 so that non-ASCII page text is written the
# same way regardless of the platform's default locale.
with open('./pages_first_step.csv', 'w', newline='', encoding='utf-8') as myfile:
    wr = csv.DictWriter(myfile, dict_keys)
    wr.writeheader()

    # Every value produced by mwxml.map is one list of row dicts yielded by
    # process_dump.
    for info in mwxml.map(process_dump, paths):
        wr.writerows(info)
        count += len(info)

print("%d lines were written to the file" % count)
Mapper 3: An error occured while processing /public/dumps/public/enwiki/20190520/enwiki-201905
Mapper 3: Traceback (most recent call last):
  File "/srv/paws/lib/python3.6/site-packages/para/map.py", line 142, in run
    for value in self.process(item):
  File "/srv/paws/lib/python3.6/site-packages/mwxml/map/map.py", line 47, in process_path
    yield from process(dump, path)
  File "<ipython-input-70-3ac203806d4e>", line 7, in process_dump
    for revision in page:
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 32, in __iter__
    for revision in self.__revisions:
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 44, in load_revisions
    yield Revision.from_element(first_revision)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/revision.py", line 40, in from_element
    user = User.from_element(sub_element)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/user.py", line 19, in from_element
    values = consume_tags(cls.TAG_MAP, element)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/util.py", line 7, in consume_tags
    value_map[tag_name] = tag_map[tag_name](sub_element)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/user.py", line 12, in <lambda>
    'id': lambda e: int(e.text),
TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-74-aa92324fe59c> in <module>
      6     wr.writeheader()
      7 
----> 8     for info in mwxml.map(process_dump, paths):
      9         for i in info:
     10             wr.writerow(i)

/srv/paws/lib/python3.6/site-packages/mwxml/map/map.py in map(process, paths, threads)
     47         yield from process(dump, path)
     48 
---> 49     yield from para.map(process_path, paths, mappers=threads)

/srv/paws/lib/python3.6/site-packages/para/map.py in _map_many_items(process, items, mappers, output_queue_size)
    100                 yield value
    101             else:
--> 102                 raise error
    103         except KeyboardInterrupt:
    104             logger.warning("KeyboardInterrupt detected.  Finishing...")

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
# Sample wikitext used to try out sentence-splitting approaches. The pieces
# below are adjacent string literals that concatenate into a single string.
prueba = (
    'Anarchism does not offer a fixed body of doctrine from a single particular worldview. '
    '¿Probando? '
    'However, {{sfn|Marshall|1993|pp=14–17}}. '
    'Many types and traditions of anarchism exist, not all of which are mutually exclusive.'
    '{{sfn|Sylvan|2007|p=262}} '
    '[[Anarchist schools of thought]] can differ fundamentally, supporting anything from extreme '
    '[[individualism]] to complete [[collectivism]]. '
    'For example google.com. '
    'People started looting stores and warehouses in order to get supplies, mainly food."'
    '<ref name="aardappeloproer"></ref>'
)
# Splitting experiments tried on `prueba` (left disabled):
#sentences
#sentences = re.findall('.*?[.!\?]', prueba)
#sentences = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)[\W.]', prueba)
#sentences = prueba.split('.')
#sentences
['Anarchism does not offer a fixed body of doctrine from a single particular worldview.',
 '¿Probando?',
 'However, {{sfn|Marshall|1993|pp=14–17}}.',
 'Many types and traditions of anarchism exist, not all of which are mutually exclusive.',
 '{sfn|Sylvan|2007|p=262}} [[Anarchist schools of thought]] can differ fundamentally, supporting anything from extreme [[individualism]] to complete [[collectivism]].',
 'For example google.com.',
 'People started looting stores and warehouses in order to get supplies, mainly food.',
 '<ref name="aardappeloproer"></ref>']