import csv
from urllib.request import urlopen
# import codecs
# from tqdm import tqdm
# from tqdm._tqdm_notebook import tqdm_notebook
import numpy as np
import pandas as pd

#### PANDAS display defaults

# Raise the row/column display limits so wide, long frames are shown
# without truncation. set_option accepts several (option, value) pairs.
pd.set_option(
    'display.max_rows', 2000,
    'display.max_columns', 500,
)

##### Setup SPARQL queries

import urllib.parse
from IPython.display import IFrame

def wdq(query='', width=800, height=500):
  """Return an IPython ``IFrame`` showing ``baseurl`` plus the URL-quoted query.

  ``query`` is percent-encoded with ``urllib.parse.quote`` and appended to
  the module-level ``baseurl``; ``width`` and ``height`` are passed through
  to the ``IFrame`` unchanged.
  """
  # NOTE(review): ``baseurl`` is not defined in the visible part of this
  # file -- confirm it is assigned before this function is called.
  target = baseurl + urllib.parse.quote(query)
  return IFrame(target, width=width, height=height)

import requests

# Wikidata SPARQL query endpoint.
wikidata_api_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
# Met Museum collection API, objects endpoint.
met_api_url = 'https://collectionapi.metmuseum.org/public/collection/v1/objects/'
# Google Sheets CSV export of the crosswalk spreadsheet.
crosswalk_sheet_url = 'https://docs.google.com/spreadsheets/d/1WmXW2CjlLidcUXzahQsB3HjUVvECns4xDyIt-Hw-jW8/export?format=csv&id'

# Column name -> dtype mapping passed as ``dtype=`` when the Met objects CSV
# is parsed with ``pd.read_csv``. Only the columns listed here are pinned.
# The literal was previously left unclosed, which made the rest of the file
# a syntax error.
metdtypes = {
    'Object Number': str,
    'Is Highlight': bool,
    'Is Public Domain': bool,
    'Object ID': int,
    'Object Name': str,
    'Artist Role': str,
    'Artist Prefix': str,
    'Artist Display Name': str,
    'Artist Display Bio': str,
    'Artist Suffix': str,
    'Artist Alpha Sort': str,
    'Artist Nationality': str,
    'Artist Begin Date': str,
    'Artist End Date': str,
    'Object Date': str,
    'Object Begin Date': int,
    'Object End Date': int,
    'Credit Line': str,
    'Geography Type': str,
    'Rights and Reproduction': str,
    'Link Resource': str,
    'Metadata Date': str,
}

# Use local copy of CSV or HDF file for speed
import os.path

databasefile = 'metmuseum/MetObjects-latest'

hdf_file = databasefile + '.h5'
csv_file = databasefile + '.csv'

# Prefer the HDF copy; fall back to parsing the CSV only when no HDF file
# exists. The CSV check used to be nested inside the HDF branch, so a missing
# HDF file left ``df`` undefined and an existing one was immediately
# overwritten by a CSV re-parse.
if os.path.isfile(hdf_file):
    # Read the HDF file into a pandas dataframe
    print('Working on HDF')
    # ``dtype`` and ``low_memory`` are read_csv options and are not part of
    # read_hdf's documented signature, so they are not passed here. The key
    # matches the one used when the file is written with ``to_hdf``.
    df = pd.read_hdf(hdf_file, key='met')
elif os.path.isfile(csv_file):
    print('Working on CSV')
    df = pd.read_csv(csv_file, dtype=metdtypes, low_memory=False)
else:
    # Fail here with a clear message instead of a later NameError on ``df``.
    raise FileNotFoundError(
        'Neither ' + hdf_file + ' nor ' + csv_file + ' exists'
    )

# Make a new shallow/efficient copy of dataframe (data in place) with just highlights
# hdf = df[df['Is Highlight'] == True].copy()
# Recorded cell output from the load step:
#   Working on CSV
#   CPU times: user 19 s, sys: 2.29 s, total: 21.3 s
#   Wall time: 5min 14s
# If you need to convert dataframe columns
# columns = ['Object Number']
# df.loc[:,columns] = df[columns].applymap(int)

# Write the dataframe to the HDF file under key 'met', replacing any
# existing file (mode='w'), in 'table' format.
print('Converting to HDF')
df.to_hdf(
    hdf_file,
    key='met',
    mode='w',
    format='table',
)
# Recorded cell output:
#   Converting to HDF
#
# Column dtypes of the dataframe (presumably the output of `df.dtypes`):
#   Object Number              object
#   Is Highlight                 bool
#   Is Public Domain             bool
#   Object ID                   int64
#   Department                 object
#   Object Name                object
#   Title                      object
#   Culture                    object
#   Period                     object
#   Dynasty                    object
#   Reign                      object
#   Portfolio                  object
#   Artist Role                object
#   Artist Prefix              object
#   Artist Display Name        object
#   Artist Display Bio         object
#   Artist Suffix              object
#   Artist Alpha Sort          object
#   Artist Nationality         object
#   Artist Begin Date          object
#   Artist End Date            object
#   Object Date                object
#   Object Begin Date           int64
#   Object End Date             int64
#   Medium                     object
#   Dimensions                 object
#   Credit Line                object
#   Geography Type             object
#   City                       object
#   State                      object
#   County                     object
#   Country                    object
#   Region                     object
#   Subregion                  object
#   Locale                     object
#   Locus                      object
#   Excavation                 object
#   River                      object
#   Classification             object
#   Rights and Reproduction    object
#   Link Resource              object
#   Metadata Date              object
#   Repository                 object
#   Tags                       object
#   dtype: object
# df.dtypes

# Show ten random rows restricted to the three date-related columns.
df.loc[
    :,
    ['Object Date', 'Object Begin Date', 'Object End Date'],
].sample(n=10)
# Recorded cell output (first field is the row index):
#   Object Date Object Begin Date Object End Date
#   352394 1914; original ca. 1390–1353 B.C. -1390 -1353
#   442545 ca. 1920–29 1920 1930
#   330759 1916 1916 1916
#   34499 18th century 1700 1799
#   161680 1932 1932 1932
#   326679 1947 1947 1947
#   314794 7th century 600 699
#   339996 ca. 2030–1640 B.C. -2030 -1640
#   327733 1943 1943 1943
#   365210 ca. 1336–1327 B.C. -1336 -1336
# import h5py
# file = databasefile+'.h5'
# df.to_hdf(file, key='met', mode='w')