import csv
import io
import re
import unicodedata

import mwparserfromhell as mw
from bs4 import BeautifulSoup
dict_keys = ['page', 'sentence']

# Wiki link of the form [[Target]] or [[Target|label]]. Group 2 is the link
# target, which is what gets kept; \w+ means only single-word targets match.
_WIKILINK = re.compile(r'(\[\[(\w+)(\|[\w\-]+)?\]\])')
# An opening ref tag whose leading "<" is missing, e.g. 'ref name="x">' or
# 'ref>'. The lookbehind rejects "<ref", "/ref" and "ref" inside a word
# ("preference"); the lookahead requires the tag to close or an attribute to
# follow, so plain words such as "refuge" or "reference" are left alone.
_BARE_REF_OPEN = re.compile(r'(?<![<\w/])ref(?=\s*/?>|\s+\w+\s*=)')
# Any remaining <...> tag.
_TAG = re.compile(r'<[^>]+>')


def _clean_sentence(raw):
    """Return *raw* with HTML, wiki links and leftover tags removed.

    Steps, in order: extract the text with BeautifulSoup, apply NFKD
    normalization, replace wiki links by their target, restore the "<" on
    bare opening ref tags, then delete every remaining <...> tag.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    text = BeautifulSoup(raw, 'html.parser').get_text()
    text = unicodedata.normalize("NFKD", text)
    text = _WIKILINK.sub(r'\2', text)
    text = _BARE_REF_OPEN.sub('<ref', text)
    return _TAG.sub('', text)


# The output is written as UTF-8 to match the input encoding; newline='' on
# both files lets the csv module handle line endings itself.
with open('./pages_first_step_parsed.csv', 'w', encoding='utf8', newline='') as myfile:
    wr = csv.DictWriter(myfile, dict_keys)
    wr.writeheader()

    with open('./pages_first_step.csv', encoding='utf8', newline='') as file:
        reader = csv.DictReader(file, delimiter=',', quotechar='"')
        for row in reader:
            wr.writerow({
                'page': row['page'],
                'sentence': _clean_sentence(row['sentence']),
            })
/srv/paws/lib/python3.6/site-packages/bs4/__init__.py:335: UserWarning: "http://wcfcourier.com/app/special/flood_buyout/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/srv/paws/lib/python3.6/site-packages/bs4/__init__.py:335: UserWarning: "http://www.rochdaleobserver.co.uk/news/s/524/524518_ancient_flooded_highway_finally_runs_its_course.html" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/srv/paws/lib/python3.6/site-packages/bs4/__init__.py:335: UserWarning: "https://web.archive.org/web/20110721163023/http://www.igovernment.in/site/Bihars-flood-of-fury-End-of-Kosi-civilisation/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
# Worked example of the first cleaning steps on a single sentence.
text = 'About 7,000 acres (28&nbsp;km<sup>2</sup>) in the center of the refuge are made up of [[Floodplain|flood-plains]] watered by irrigation systems connected to the Rio Grande.'
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(text, 'html.parser')
text2 = soup.get_text()
# NFKD normalization (here: the non-breaking space from &nbsp; becomes a
# plain space).
new_str = unicodedata.normalize("NFKD", text2)
# Replace [[Target]] / [[Target|label]] by the link target (group 2).
replaced = re.sub(r'(\[\[(\w+)(\|[\w\-]+)?\]\])', r'\2', new_str)
replaced
'About 7,000 acres (28 km2) in the center of the refuge are made up of Floodplain watered by irrigation systems connected to the Rio Grande.'
 
# Worked example: a sentence that starts with a ref tag whose "<" is missing.
text2 = 'ref name="Marspower"></ref><ref name="Eg├ąpower"></ref> To aid the new treatment plants, and avoid floodings,'
# Match only a bare opening ref tag: not preceded by "<", "/" or a word
# character, and followed by a tag close or an attribute assignment. This
# keeps ordinary words such as "refuge" or "reference" untouched.
bare_ref_open = re.compile(r'(?<![<\w/])ref(?=\s*/?>|\s+\w+\s*=)')
any_tag = re.compile(r'<[^>]+>')
# Step 1: restore the missing "<" so the fragment is a well-formed tag.
t = bare_ref_open.sub('<ref', text2)
print(t)
# Step 2: drop every <...> tag.
t = any_tag.sub('', t)
<ref name="Marspower"></ref><ref name="Eg├ąpower"></ref> To aid the new treatment plants, and avoid floodings,
' To aid the new treatment plants, and avoid floodings,'