Extracting pages containing keywords from a dump

This notebook scans the pages of a given Wikipedia dump and extracts those containing at least one keyword from a given set.

! pip install -U spacy
#! python -m spacy download es_core_news_sm
! python -m spacy download en_core_web_sm
Requirement already up-to-date: spacy in /srv/paws/lib/python3.6/site-packages
Requirement already up-to-date: jsonschema<3.0.0,>=2.6.0 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: murmurhash<1.1.0,>=0.28.0 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: numpy>=1.15.0 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: preshed<2.1.0,>=2.0.1 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: requests<3.0.0,>=2.13.0 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: cymem<2.1.0,>=2.0.2 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: srsly<1.1.0,>=0.0.5 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: plac<1.0.0,>=0.9.6 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: blis<0.3.0,>=0.2.2 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: wasabi<1.1.0,>=0.2.0 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: thinc<7.1.0,>=7.0.2 in /srv/paws/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: urllib3<1.25,>=1.21.1 in /srv/paws/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy)
Requirement already up-to-date: certifi>=2017.4.17 in /srv/paws/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy)
Requirement already up-to-date: chardet<3.1.0,>=3.0.2 in /srv/paws/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy)
Requirement already up-to-date: idna<2.9,>=2.5 in /srv/paws/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy)
Requirement already up-to-date: tqdm<5.0.0,>=4.10.0 in /srv/paws/lib/python3.6/site-packages (from thinc<7.1.0,>=7.0.2->spacy)
Requirement already satisfied: en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0 in /srv/paws/lib/python3.6/site-packages
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
import mwxml
import re
import copy
import csv

Define paths to visit

import glob

# Shard files of the 2019-04-01 English Wikipedia full-history dump (part 1).
# glob returns them in filesystem order, which is why the list below is unsorted.
_DUMP_PATTERN = '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-*.bz2'
paths = glob.glob(_DUMP_PATTERN)
paths
['/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p22580p24305.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p9238p11043.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p27785p29645.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p4055p5616.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p2049p4054.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p14389p15941.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p26093p27784.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p19358p20988.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p11044p12715.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p10p2048.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p7432p9237.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p29646p30303.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p15942p17726.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p5617p7431.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p12716p14388.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p24306p26092.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p20989p22579.bz2',
 '/public/dumps/public/enwiki/20190401/enwiki-20190401-pages-meta-history1.xml-p17727p19357.bz2']
# Primary flood keywords ("type 1" matches; entity extraction is run on these).
keywords_raw1 = "flood,floods,flooding,flooded,inundation"
# Secondary, flood-related phrases ("type 2" matches; recorded without entity extraction).
keywords_raw2 = "waterlogged,waterlogging,heavy rain,heavy rains,extreme rainfall,sewage overflow,river overflow,rain overflow,water overflow"
keywords = keywords_raw1.split(',')
keywords_additional = keywords_raw2.split(',')
# Combined list used for the cheap first-pass paragraph filter in process_dump.
keywords_total = keywords + keywords_additional

print(keywords)
['flood', 'floods', 'flooding', 'flooded', 'inundation']

Find keywords function

Returns 1 if a primary keyword is found, 2 if a secondary keyword is found, and False otherwise.

def find_keywords(text, primary=None, secondary=None):
    """Classify *text* by the flood keywords it contains.

    Parameters
    ----------
    text : str
        The sentence (or paragraph) to classify.
    primary : list of str, optional
        Primary keywords; defaults to the module-level ``keywords``.
    secondary : list of str, optional
        Secondary keywords; defaults to the module-level ``keywords_additional``.

    Returns
    -------
    1 if a primary keyword occurs, 2 if a secondary keyword occurs,
    False otherwise.

    Matching is whole-word and case-insensitive, mirroring the
    ``\\b...\\b``/``re.IGNORECASE`` search used in ``find_sentence``.
    The previous substring test (``k in text``) flagged words such as
    "floodplain" as type 1 even though no word-bounded keyword could
    then be extracted, so those sentences were silently dropped and
    never even considered as type 2.
    """
    if primary is None:
        primary = keywords
    if secondary is None:
        secondary = keywords_additional

    def _has_word(word_list):
        return any(
            re.search(r'\b{}\b'.format(re.escape(k)), text, re.IGNORECASE)
            for k in word_list
        )

    if _has_word(primary):
        return 1
    if _has_word(secondary):
        return 2
    return False
def find_sentence(paragraph):
    """Split *paragraph* into sentences and collect the flood-related ones.

    Each matching sentence yields a dict:
      - type 1 (primary keyword): ``keyword`` (the first primary keyword,
        in list order, matching as a whole word), ``sentence``,
        ``additional`` (entities from ``get_flood_info``), ``type``.
      - type 2 (secondary keyword): ``sentence`` and ``type`` only.

    Returns the list of dicts (possibly empty).
    """
    results = []

    for sent in nlp(paragraph).sents:
        # spaCy 2.x API: Span.string keeps trailing whitespace, hence strip().
        sentence = sent.string.strip()
        kind = find_keywords(sentence)

        if kind == 1:
            # Record only the first primary keyword that matches as a
            # whole word, case-insensitively, then stop scanning.
            for candidate in keywords:
                match = re.search(r'\b{}\b'.format(candidate), sentence, re.IGNORECASE)
                if match:
                    results.append({
                        'keyword': match.group(),
                        'sentence': sentence,
                        'additional': get_flood_info(sentence),
                        'type': 1,
                    })
                    break
        elif kind == 2:
            results.append({'sentence': sentence, 'type': 2})

    return results
def get_flood_info(sentence):
    """Run spaCy NER on *sentence* and bucket selected entity types.

    Returns a dict with keys ``dates`` (DATE entities), ``locs`` (GPE
    entities) and ``entities`` (LOC entities); each value is a list of
    entity texts, empty when nothing of that type is found.
    """
    info = {'dates': [], 'locs': [], 'entities': []}
    # NOTE: GPE (geopolitical entities) land in 'locs', while LOC
    # (non-GPE locations) land in 'entities' — kept as-is for
    # compatibility with existing consumers of this dict.
    bucket_for = {'DATE': 'dates', 'GPE': 'locs', 'LOC': 'entities'}

    for ent in nlp(sentence).ents:
        bucket = bucket_for.get(ent.label_)
        if bucket is not None:
            info[bucket].append(ent.text)

    return info

XML Processor on path

def process_dump(dump, path):
    """Yield lists of flood-related sentence records from one dump file.

    Intended for use with ``mwxml.map``.  For every main-namespace page
    and every revision, paragraphs containing any keyword are passed to
    ``find_sentence``; each previously unseen sentence (deduplicated
    per page) is yielded with ``page`` and ``revision.timestamp`` added.

    Fixes over the previous version:
      - uses ``page.title`` instead of regex-scraping ``str(page)``
        (which also stripped apostrophes from titles);
      - skips revisions whose text is ``None`` (deleted/suppressed);
        ``str(revision.text)`` used to turn these into the string "None";
      - each paragraph is analyzed at most once instead of once per
        matching keyword, and keyword regexes are compiled up front.
    """
    keyword_patterns = [
        re.compile(r'\b{}\b'.format(re.escape(kw)), re.IGNORECASE)
        for kw in keywords_total
    ]

    for page in dump:
        if page.namespace != 0:  # articles only
            continue
        seen_sentences = set()  # dedupe sentences across this page's revisions
        for revision in page:
            if revision.text is None:
                continue
            for paragraph in revision.text.split('\n'):
                if not any(p.search(paragraph) for p in keyword_patterns):
                    continue
                output = []
                for record in find_sentence(paragraph):
                    if record['sentence'] in seen_sentences:
                        continue
                    record['page'] = page.title
                    record['revision.timestamp'] = revision.timestamp
                    output.append(record)
                    seen_sentences.add(record['sentence'])
                if output:
                    yield output

OK. Now that everything is defined, it's time to run the code. mwxml has a map() function that applies the process_dump function to each of the XML dump files in paths -- in parallel -- using python's multiprocessing library and collects all of the yielded values in a generator. As the code below demonstrates, it's easy to collect this output and write it to a new output file or print it out to the console (not recommended for large amounts of output).

# Fan the dump shards out across worker processes and stream every
# matched sentence record into pages.csv.
count = 0
fieldnames = ['keyword', 'type', 'additional', 'revision.timestamp', 'page', 'sentence']

with open('./pages.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames)
    writer.writeheader()

    # mwxml.map yields one list of records per (page, paragraph) hit.
    for records in mwxml.map(process_dump, paths):
        for record in records:
            writer.writerow(record)
            count += 1

print("%d lines were written to the file" % count)
KeyboardInterrupt detected.  Finishing...
Process Mapper 0:
20 lines were written to the file
Process Mapper 1:
Traceback (most recent call last):
Process Mapper 2:
Process Mapper 3:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/srv/paws/lib/python3.6/site-packages/para/map.py", line 142, in run
    for value in self.process(item):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/srv/paws/lib/python3.6/site-packages/para/map.py", line 142, in run
    for value in self.process(item):
  File "/srv/paws/lib/python3.6/site-packages/mwxml/map/map.py", line 47, in process_path
    yield from process(dump, path)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/srv/paws/lib/python3.6/site-packages/mwxml/map/map.py", line 47, in process_path
    yield from process(dump, path)
  File "/srv/paws/lib/python3.6/site-packages/para/map.py", line 142, in run
    for value in self.process(item):
  File "<ipython-input-18-fdcbe42a1014>", line 8, in process_dump
    for revision in page:
  File "<ipython-input-18-fdcbe42a1014>", line 8, in process_dump
    for revision in page:
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 32, in __iter__
    for revision in self.__revisions:
  File "/srv/paws/lib/python3.6/site-packages/mwxml/map/map.py", line 47, in process_path
    yield from process(dump, path)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 32, in __iter__
    for revision in self.__revisions:
  File "/srv/paws/lib/python3.6/site-packages/para/map.py", line 142, in run
    for value in self.process(item):
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 50, in load_revisions
    yield Revision.from_element(sub_element)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/map/map.py", line 47, in process_path
    yield from process(dump, path)
  File "<ipython-input-18-fdcbe42a1014>", line 8, in process_dump
    for revision in page:
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 50, in load_revisions
    yield Revision.from_element(sub_element)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/revision.py", line 58, in from_element
    text = sub_element.text
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 89, in __getattr__
    self.complete()
  File "<ipython-input-18-fdcbe42a1014>", line 9, in process_dump
    paragraphs = re.split('\n', str(revision.text))
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 32, in __iter__
    for revision in self.__revisions:
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 72, in complete
    event, element = next(self.pointer)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/page.py", line 50, in load_revisions
    yield Revision.from_element(sub_element)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 22, in __next__
    event, element = next(self.etree_events)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/revision.py", line 58, in from_element
    text = sub_element.text
  File "/usr/lib/python3.6/re.py", line 212, in split
    return _compile(pattern, flags).split(string, maxsplit)
  File "/usr/lib/python3.6/xml/etree/ElementTree.py", line 1223, in iterator
    data = source.read(16 * 1024)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 89, in __getattr__
    self.complete()
  File "/usr/lib/python3.6/bz2.py", line 195, in read1
    return self._buffer.read1(size)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/iteration/revision.py", line 58, in from_element
    text = sub_element.text
  File "/usr/lib/python3.6/_compression.py", line 68, in readinto
    data = self.read(len(byte_view))
KeyboardInterrupt
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 72, in complete
    event, element = next(self.pointer)
  File "/usr/lib/python3.6/_compression.py", line 103, in read
    data = self._decompressor.decompress(rawblock, size)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 89, in __getattr__
    self.complete()
KeyboardInterrupt
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 22, in __next__
    event, element = next(self.etree_events)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 72, in complete
    event, element = next(self.pointer)
  File "/srv/paws/lib/python3.6/site-packages/mwxml/element_iterator.py", line 22, in __next__
    event, element = next(self.etree_events)
  File "/usr/lib/python3.6/xml/etree/ElementTree.py", line 1223, in iterator
    data = source.read(16 * 1024)
  File "/usr/lib/python3.6/bz2.py", line 195, in read1
    return self._buffer.read1(size)
KeyboardInterrupt
  File "/usr/lib/python3.6/xml/etree/ElementTree.py", line 1223, in iterator
    data = source.read(16 * 1024)
  File "/usr/lib/python3.6/bz2.py", line 195, in read1
    return self._buffer.read1(size)
  File "/usr/lib/python3.6/_compression.py", line 68, in readinto
    data = self.read(len(byte_view))
  File "/usr/lib/python3.6/_compression.py", line 103, in read
    data = self._decompressor.decompress(rawblock, size)
KeyboardInterrupt
pages_kw
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-6623b315597f> in <module>()
----> 1 pages_kw

NameError: name 'pages_kw' is not defined