from bs4 import BeautifulSoup
import requests
import re

# Accumulator for every DOI suffix scraped across all source pages.
DOIs = []

# Regex matching the leading DOI-resolver URL so it can be stripped to
# leave the bare DOI, e.g. "10.1109/5.771073".  Accepts http or https and
# the optional "dx." / "www." host prefixes.
# BUGFIX: the previous pattern 'http.://.*doi.org/' required an extra
# character between "http" and "://", so plain "http://doi.org/..." links
# never matched; its dots were also unescaped.
doi_root = r'https?://(?:dx\.)?(?:www\.)?doi\.org/'

# Source pages to crawl for DOI links.
urls = [
    'http://www.doi.org/',
    'https://www.ncbi.nlm.nih.gov/pubmed/',
    'https://doaj.org/',
    'http://www.persee.fr/',
    'https://journals.openedition.org/',
    'http://www.scielo.br/',
    'https://www.elsevier.com',
    'https://www.openaccessjournals.com/',
    'https://journals.sagepub.com/home/sgo',
]
def get_content(url):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    A timeout is set so a dead or slow server cannot hang the whole
    scrape indefinitely (requests has no default timeout).

    NOTE(review): forcing iso-8859-1 decoding is dubious for these
    mostly-UTF-8 sites, but it is kept to preserve existing behaviour --
    confirm before changing.
    """
    r = requests.get(url, timeout=10)  # fail fast instead of hanging forever
    soup = BeautifulSoup(r.content, 'html.parser', from_encoding="iso-8859-1")
    #print(soup.prettify()) #debug only
    return soup
# def extract DOIs

def add_dois(url, DOIs, doi_root):
    """Scrape *url* and append every DOI found on the page to *DOIs*.

    Every anchor whose href matches *doi_root* has that resolver prefix
    stripped so only the bare DOI (e.g. "10.1109/5.771073") is stored.
    Mutates *DOIs* in place and returns it for convenience.
    """
    soup = get_content(url)
    for link in soup.find_all('a'):
        href = link.get('href')
        # BUGFIX: the old code did str(link.get('href')), turning a
        # missing href into the literal string "None"; skip it instead.
        if href and re.search(doi_root, href):
            DOIs.append(re.sub(doi_root, '', href))
    return DOIs
if __name__ == '__main__':
    # Crawl each source page, collecting bare DOIs into the shared list.
    for url in urls:
        add_dois(url, DOIs, doi_root)

    # Report everything found, one DOI per line.
    for DOI in DOIs:
        print(DOI)
    # NOTE: removed the stray trailing line `10.1109/5.771073` -- it was
    # pasted program output that Python parsed as a useless float
    # division (10.1109 / 5.771073).