Pandas Data Types

Accompanying the PB Python article here

import pandas as pd
import numpy as np
df = pd.read_csv("https://github.com/chris1610/pbpython/blob/master/data/sales_data_types.csv?raw=True")
df
Customer Number Customer Name 2016 2017 Percent Growth Jan Units Month Day Year Active
0 10002.0 Quest Industries $125,000.00 $162500.00 30.00% 500 1 10 2015 Y
1 552278.0 Smith Plumbing $920,000.00 $101,2000.00 10.00% 700 6 15 2014 Y
2 23477.0 ACME Industrial $50,000.00 $62500.00 25.00% 125 3 29 2016 Y
3 24900.0 Brekke LTD $350,000.00 $490000.00 4.00% 75 10 27 2015 Y
4 651029.0 Harbor Co $15,000.00 $12750.00 -15.00% Closed 2 2 2014 N

Use df.info and df.dtypes to look at the types that pandas automatically infers based on the data

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
Customer Number    5 non-null float64
Customer Name      5 non-null object
2016               5 non-null object
2017               5 non-null object
Percent Growth     5 non-null object
Jan Units          5 non-null object
Month              5 non-null int64
Day                5 non-null int64
Year               5 non-null int64
Active             5 non-null object
dtypes: float64(1), int64(3), object(6)
memory usage: 480.0+ bytes

df.dtypes

Since the 2016 and 2017 columns were read in as objects, trying to add the values will result in string concatenation not numerical addition

df['2016'] + df['2017']
0      $125,000.00$162500.00
1    $920,000.00$101,2000.00
2        $50,000.00$62500.00
3      $350,000.00$490000.00
4        $15,000.00$12750.00
dtype: object

The simplest way to to convert to a type is using astype.

We can apply it to the customer number first.

df['Customer Number'].astype('int')
0     10002
1    552278
2     23477
3     24900
4    651029
Name: Customer Number, dtype: int64

The code above does not alter the original dataframe

df.dtypes
Customer Number    float64
Customer Name       object
2016                object
2017                object
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object

Assign the new integer customer number back to the original frame and check the type

df["Customer Number"] = df['Customer Number'].astype('int')
df.dtypes
Customer Number     int64
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object
df
Customer Number Customer Name 2016 2017 Percent Growth Jan Units Month Day Year Active
0 10002 Quest Industries $125,000.00 $162500.00 30.00% 500 1 10 2015 Y
1 552278 Smith Plumbing $920,000.00 $101,2000.00 10.00% 700 6 15 2014 Y
2 23477 ACME Industrial $50,000.00 $62500.00 25.00% 125 3 29 2016 Y
3 24900 Brekke LTD $350,000.00 $490000.00 4.00% 75 10 27 2015 Y
4 651029 Harbor Co $15,000.00 $12750.00 -15.00% Closed 2 2 2014 N

The data all looks good for the Customer Number.

If we try to convert the Jan Units column, we will get an error.

df['Jan Units'].astype('int')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-31333711e4a4> in <module>()
----> 1 df['Jan Units'].astype('int')

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    116                 else:
    117                     kwargs[new_arg_name] = new_arg_value
--> 118             return func(*args, **kwargs)
    119         return wrapper
    120     return _deprecate_kwarg

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
   4002         # else, only a single dtype is given
   4003         new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004                                      **kwargs)
   4005         return self._constructor(new_data).__finalize__(self)
   4006 

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
   3460 
   3461     def astype(self, dtype, **kwargs):
-> 3462         return self.apply('astype', dtype=dtype, **kwargs)
   3463 
   3464     def convert(self, **kwargs):

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3327 
   3328             kwargs['mgr'] = self
-> 3329             applied = getattr(b, f)(**kwargs)
   3330             result_blocks = _extend_blocks(applied, result_blocks)
   3331 

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs)
    542     def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
    543         return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544                             **kwargs)
    545 
    546     def _astype(self, dtype, copy=False, errors='raise', values=None,

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
    623 
    624                 # _astype_nansafe works fine with 1-d only
--> 625                 values = astype_nansafe(values.ravel(), dtype, copy=True)
    626                 values = values.reshape(self.shape)
    627 

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy)
    690     elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
    691         # work around NumPy brokenness, #1987
--> 692         return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
    693 
    694     if dtype.name in ("datetime64", "timedelta64"):

pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()

pandas/_libs/src/util.pxd in util.set_value_at_unsafe()

ValueError: invalid literal for int() with base 10: 'Closed'

In a similar manner we get an error if we try to convert the sales column

df['2016'].astype('float')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-999869d577b0> in <module>()
----> 1 df['2016'].astype('float')

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    116                 else:
    117                     kwargs[new_arg_name] = new_arg_value
--> 118             return func(*args, **kwargs)
    119         return wrapper
    120     return _deprecate_kwarg

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
   4002         # else, only a single dtype is given
   4003         new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004                                      **kwargs)
   4005         return self._constructor(new_data).__finalize__(self)
   4006 

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
   3460 
   3461     def astype(self, dtype, **kwargs):
-> 3462         return self.apply('astype', dtype=dtype, **kwargs)
   3463 
   3464     def convert(self, **kwargs):

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3327 
   3328             kwargs['mgr'] = self
-> 3329             applied = getattr(b, f)(**kwargs)
   3330             result_blocks = _extend_blocks(applied, result_blocks)
   3331 

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs)
    542     def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
    543         return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544                             **kwargs)
    545 
    546     def _astype(self, dtype, copy=False, errors='raise', values=None,

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
    623 
    624                 # _astype_nansafe works fine with 1-d only
--> 625                 values = astype_nansafe(values.ravel(), dtype, copy=True)
    626                 values = values.reshape(self.shape)
    627 

~/miniconda3/envs/pbp3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy)
    701 
    702     if copy:
--> 703         return arr.astype(dtype)
    704     return arr.view(dtype)
    705 

ValueError: could not convert string to float: '$15,000.00'

We can try to use astype with a bool type but that does not give expected results

df['Active'].astype('bool')
0    True
1    True
2    True
3    True
4    True
Name: Active, dtype: bool
# astype can take a dictionary of column names and data types
df.astype({'Customer Number': 'int', 'Customer Name': 'str'}).dtypes
Customer Number     int64
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object

In order to convert the currency and percentages, we need to use custom functions

def convert_currency(val):
    """
    Convert the string number value to a float
     - Remove $
     - Remove commas
     - Convert to float type
    """
    new_val = val.replace(',','').replace('$', '')
    return float(new_val)
def convert_percent(val):
    """
    Convert the percentage string to an actual floating point percent
    """
    new_val = val.replace('%', '')
    return float(new_val) / 100

Use apply to convert the 2016 and 2017 columns to floating point numbers

df['2016'].apply(convert_currency)
0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64
df['2017'].apply(convert_currency)
0     162500.0
1    1012000.0
2      62500.0
3     490000.0
4      12750.0
Name: 2017, dtype: float64

We could use a lambda function as well but it may be more difficult for new users to understand

df['2016'].apply(lambda x: x.replace('$', '').replace(',', '')).astype('float')
0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64
# Assign the converted values back to the columns
df['2016'] = df['2016'].apply(convert_currency)
df['2017'] = df['2017'].apply(convert_currency)

Use a lambda function to convert the percentage strings to numbers

df['Percent Growth'].apply(lambda x: x.replace('%', '')).astype('float') / 100
0    0.30
1    0.10
2    0.25
3    0.04
4   -0.15
Name: Percent Growth, dtype: float64
df['Percent Growth'] = df['Percent Growth'].apply(convert_percent)
df.dtypes
Customer Number      int64
Customer Name       object
2016               float64
2017               float64
Percent Growth     float64
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object
# Let's look at the data so far
df
Customer Number Customer Name 2016 2017 Percent Growth Jan Units Month Day Year Active
0 10002 Quest Industries 125000.0 162500.0 0.30 500 1 10 2015 Y
1 552278 Smith Plumbing 920000.0 1012000.0 0.10 700 6 15 2014 Y
2 23477 ACME Industrial 50000.0 62500.0 0.25 125 3 29 2016 Y
3 24900 Brekke LTD 350000.0 490000.0 0.04 75 10 27 2015 Y
4 651029 Harbor Co 15000.0 12750.0 -0.15 Closed 2 2 2014 N

pd.to_numeric is another option for handling column conversions when invalid values are included

pd.to_numeric(df['Jan Units'], errors='coerce')
0    500.0
1    700.0
2    125.0
3     75.0
4      NaN
Name: Jan Units, dtype: float64
# Fill in the NaN with 0
pd.to_numeric(df['Jan Units'], errors='coerce').fillna(0)
0    500.0
1    700.0
2    125.0
3     75.0
4      0.0
Name: Jan Units, dtype: float64

Make sure to populate the original column of data

df["Jan Units"] = pd.to_numeric(df['Jan Units'], errors='coerce').fillna(0)

pd.to_datetime is very useful for working with date conversions

pd.to_datetime(df[['Month', 'Day', 'Year']])
0   2015-01-10
1   2014-06-15
2   2016-03-29
3   2015-10-27
4   2014-02-02
dtype: datetime64[ns]
df["Start_Date"] = pd.to_datetime(df[['Month', 'Day', 'Year']])
# Check out the dataframe
df
Customer Number Customer Name 2016 2017 Percent Growth Jan Units Month Day Year Active Start_Date
0 10002 Quest Industries 125000.0 162500.0 0.30 500.0 1 10 2015 Y 2015-01-10
1 552278 Smith Plumbing 920000.0 1012000.0 0.10 700.0 6 15 2014 Y 2014-06-15
2 23477 ACME Industrial 50000.0 62500.0 0.25 125.0 3 29 2016 Y 2016-03-29
3 24900 Brekke LTD 350000.0 490000.0 0.04 75.0 10 27 2015 Y 2015-10-27
4 651029 Harbor Co 15000.0 12750.0 -0.15 0.0 2 2 2014 N 2014-02-02

Use np.where to convert the active column to a boolean

df["Active"] = np.where(df["Active"] == "Y", True, False)
df
Customer Number Customer Name 2016 2017 Percent Growth Jan Units Month Day Year Active Start_Date
0 10002 Quest Industries 125000.0 162500.0 0.30 500.0 1 10 2015 True 2015-01-10
1 552278 Smith Plumbing 920000.0 1012000.0 0.10 700.0 6 15 2014 True 2014-06-15
2 23477 ACME Industrial 50000.0 62500.0 0.25 125.0 3 29 2016 True 2016-03-29
3 24900 Brekke LTD 350000.0 490000.0 0.04 75.0 10 27 2015 True 2015-10-27
4 651029 Harbor Co 15000.0 12750.0 -0.15 0.0 2 2 2014 False 2014-02-02
df.dtypes
Customer Number             int64
Customer Name              object
2016                      float64
2017                      float64
Percent Growth            float64
Jan Units                 float64
Month                       int64
Day                         int64
Year                        int64
Active                       bool
Start_Date         datetime64[ns]
dtype: object

Many of the examples shown above can be used when reading in data using dtypes or converters arguments

df_2 = pd.read_csv("https://github.com/chris1610/pbpython/blob/master/data/sales_data_types.csv?raw=True", 
                   dtype={'Customer Number':'int'},
                   converters={'2016':convert_currency,
                               '2017': convert_currency,
                               'Percent Growth': convert_percent,
                               'Jan Units': lambda x: pd.to_numeric(x, errors='coerce'),
                               'Active': lambda x: np.where(x == "Y", True, False)
                              })
df_2.dtypes
Customer Number      int64
Customer Name       object
2016               float64
2017               float64
Percent Growth     float64
Jan Units          float64
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object
df_2
Customer Number Customer Name 2016 2017 Percent Growth Jan Units Month Day Year Active
0 10002 Quest Industries 125000.0 162500.0 0.30 500.0 1 10 2015 True
1 552278 Smith Plumbing 920000.0 1012000.0 0.10 700.0 6 15 2014 True
2 23477 ACME Industrial 50000.0 62500.0 0.25 125.0 3 29 2016 True
3 24900 Brekke LTD 350000.0 490000.0 0.04 75.0 10 27 2015 True
4 651029 Harbor Co 15000.0 12750.0 -0.15 NaN 2 2 2014 False
# This can not be applied at the time the data is read in
df_2["Start_Date"] = pd.to_datetime(df_2[['Month', 'Day', 'Year']])
df_2
Customer Number Customer Name 2016 2017 Percent Growth Jan Units Month Day Year Active Start_Date
0 10002 Quest Industries 125000.0 162500.0 0.30 500.0 1 10 2015 True 2015-01-10
1 552278 Smith Plumbing 920000.0 1012000.0 0.10 700.0 6 15 2014 True 2014-06-15
2 23477 ACME Industrial 50000.0 62500.0 0.25 125.0 3 29 2016 True 2016-03-29
3 24900 Brekke LTD 350000.0 490000.0 0.04 75.0 10 27 2015 True 2015-10-27
4 651029 Harbor Co 15000.0 12750.0 -0.15 NaN 2 2 2014 False 2014-02-02