import re

import requests
from bs4 import BeautifulSoup

# DOI suffixes collected from every page, in the order they are found.
DOIs = []
# Resolver prefix to strip: http or https, doi.org or a subdomain such as dx.doi.org.
doi_root = r'https?://(?:[\w.-]+\.)?doi\.org/'
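# Quick sanity check of the pattern (hypothetical hrefs, for illustration only):
#   re.sub(doi_root, '', 'https://doi.org/10.1000/182')    -> '10.1000/182'
#   re.sub(doi_root, '', 'http://dx.doi.org/10.1000/182')  -> '10.1000/182'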
# Pages to scan for DOI links.
urls = [
'http://www.doi.org/',
'https://www.ncbi.nlm.nih.gov/pubmed/',
'https://doaj.org/',
'http://www.persee.fr/',
'https://journals.openedition.org/',
'http://www.scielo.br/',
'https://www.elsevier.com',
'https://www.openaccessjournals.com/',
'https://journals.sagepub.com/home/sgo',
]

def get_content(url):
    # Fetch the page and let BeautifulSoup detect the document encoding.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return BeautifulSoup(r.content, 'html.parser')
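
# Example use (a minimal sketch; any URL from the list above works the same way):
#   soup = get_content('https://doaj.org/')
#   print(soup.title)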

def add_dois(url, DOIs, doi_root):
    # Extract the DOI from every doi.org link on the page.
    soup = get_content(url)
    for link in soup.find_all('a'):
        href = str(link.get('href'))
        if re.search(doi_root, href):
            # Strip the resolver prefix, keeping only the bare DOI.
            DOIs.append(re.sub(doi_root, '', href))
    return DOIs

for url in urls:
    try:
        add_dois(url, DOIs, doi_root)
    except requests.RequestException:
        pass  # skip sites that are unreachable rather than aborting the run

for DOI in DOIs:
    print(DOI)
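
# DOIs harvested from several pages can repeat. If that matters, a minimal
# order-preserving dedupe sketch (dict keys keep first-seen order in Python 3.7+;
# the name unique_dois is illustrative, not part of the original script):
unique_dois = list(dict.fromkeys(DOIs))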