%%time
import csv
from urllib.request import urlopen
# import codecs
# from tqdm import tqdm
# from tqdm._tqdm_notebook import tqdm_notebook
import numpy as np
import pandas as pd

#### Pandas display defaults

pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 500)

##### Set up SPARQL query helpers

import urllib.parse
from IPython.display import IFrame
baseurl='https://query.wikidata.org/embed.html#'

def wdq(query='', width=800, height=500):
    """Embed the Wikidata Query Service UI for a SPARQL query as an IFrame."""
    return IFrame(baseurl + urllib.parse.quote(query), width=width, height=height)
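# Example usage (illustrative only, not executed in this run): embed a small SPARQL query.
# Q3305213 is assumed here to be the Wikidata item for "painting".
# sample_query = '''
# SELECT ?item ?itemLabel WHERE {
#   ?item wdt:P31 wd:Q3305213 .
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
# } LIMIT 10
# '''
# wdq(sample_query)  # renders the Wikidata Query Service embed as an IFrame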

import requests

wikidata_api_url    = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
met_api_url         = 'https://collectionapi.metmuseum.org/public/collection/v1/objects/'
crosswalk_sheet_url = 'https://docs.google.com/spreadsheets/d/1WmXW2CjlLidcUXzahQsB3HjUVvECns4xDyIt-Hw-jW8/export?format=csv&id'
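# Example usage (illustrative only; the object ID below is arbitrary): fetch one object
# record from the Met collection API as JSON and inspect a couple of fields.
# record = requests.get(met_api_url + '45734').json()
# record.get('title'), record.get('objectDate')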

metdtypes = {
    'Object Number':str,
    'Is Highlight':bool,
    'Is Public Domain':bool,
    'Object ID':int,
    'Department':str,
    'Object Name':str,
    'Title':str,
    'Culture':str,
    'Period':str,
    'Dynasty':str,
    'Reign':str,
    'Portfolio':str,
    'Artist Role':str,
    'Artist Prefix':str,
    'Artist Display Name':str,
    'Artist Display Bio':str,
    'Artist Suffix':str,
    'Artist Alpha Sort':str,
    'Artist Nationality':str,
    'Artist Begin Date':str,
    'Artist End Date':str,
    'Object Date':str,
    'Object Begin Date':int,
    'Object End Date':int,
    'Medium':str,
    'Dimensions':str,
    'Credit Line':str,
    'Geography Type':str,
    'City':str,
    'State':str,
    'County':str,
    'Country':str,
    'Region':str,
    'Subregion':str,
    'Locale':str,
    'Locus':str,
    'Excavation':str,
    'River':str,
    'Classification':str,
    'Rights and Reproduction':str,
    'Link Resource':str,
    'Metadata Date':str,
    'Repository':str,
    'Tags':str
}

# Use local copy of CSV or HDF file for speed
databasefile = 'metmuseum/MetObjects-20190903'

import os.path

file = databasefile+'.h5'

if os.path.isfile(file):
    # Read the HDF file into a pandas dataframe; dtypes are already stored in the HDF file,
    # so no dtype/low_memory arguments are needed (those are read_csv options)
    print('Working on HDF')
    df = pd.read_hdf(file)
else:
    file = databasefile + '.csv'
    if os.path.isfile(file):
        print('Working on CSV')
        df = pd.read_csv(file, dtype=metdtypes, low_memory=False)
    else:
        raise FileNotFoundError('No local copy found: ' + databasefile + '.h5/.csv')

# Optionally make a smaller copy of the dataframe containing only the highlighted objects
# hdf = df[df['Is Highlight']].copy()
Working on HDF
CPU times: user 2 s, sys: 1.69 s, total: 3.68 s
Wall time: 4min 29s
df.dtypes
Object Number              object
Is Highlight                 bool
Is Public Domain             bool
Object ID                   int64
Department                 object
Object Name                object
Title                      object
Culture                    object
Period                     object
Dynasty                    object
Reign                      object
Portfolio                  object
Artist Role                object
Artist Prefix              object
Artist Display Name        object
Artist Display Bio         object
Artist Suffix              object
Artist Alpha Sort          object
Artist Nationality         object
Artist Begin Date          object
Artist End Date            object
Object Date                object
Object Begin Date           int64
Object End Date             int64
Medium                     object
Dimensions                 object
Credit Line                object
Geography Type             object
City                       object
State                      object
County                     object
Country                    object
Region                     object
Subregion                  object
Locale                     object
Locus                      object
Excavation                 object
River                      object
Classification             object
Rights and Reproduction    object
Link Resource              object
Metadata Date              object
Repository                 object
Tags                       object
dtype: object
# df.dtypes
df[['Object Date','Object Begin Date','Object End Date']].sample(10)
               Object Date  Object Begin Date  Object End Date
33856         14th century               1300             1333
184045                1884               1884             1884
68267              1930–59               1930             1959
59459    18th–19th century               1700             1899
41956                  NaN                618              907
416356                1889               1889             1889
255679        16th century               1600             1699
358482  ca. 1390–1352 B.C.              -1390            -1390
114423        18th century               1700             1799
388885                1616               1616             1616
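# Because 'Object Begin Date' / 'Object End Date' are plain integers (negative values
# for B.C. dates), they support direct numeric filtering; an illustrative example:
# ancient = df[df['Object Begin Date'] < 0]
# ancient[['Object Date', 'Object Begin Date', 'Object End Date']].head()
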
# import h5py
# file = databasefile+'.h5'
# df.to_hdf(file, key='met', mode='w')
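# One-time cache creation, a sketch assuming the PyTables package is installed
# (pandas uses PyTables rather than h5py for HDF5 I/O):
# if not os.path.isfile(databasefile + '.h5'):
#     csv_df = pd.read_csv(databasefile + '.csv', dtype=metdtypes, low_memory=False)
#     csv_df.to_hdf(databasefile + '.h5', key='met', mode='w')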