Skip to main content

Pandas Cheat Sheet

Importing Pandas

import pandas as pd

Loading a CSV or JSON file

df = pd.read_csv('Transactions.csv')
df = pd.read_json('Data.csv')

Dropping a column

df = df.drop(['Balance'], axis=1)

Examining Column Data Types

df.dtypes

Date object
Description object
Credit float64
Debit float64
dtype: object

Converting a Column to Datetime

df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%Y")

Combining two columns

df['Amount'] = df.bfill(axis=1).iloc[:, 2]

Check for Nulls

df.isnan().values.any()

Set More useful table display widths

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Extract a Regex into a new column

df['Extract'] = df['Description'].str.extract('(.*) - (Visa Purchase|Osko Payment|Internal Transfer|Receipt)')

Pandas number formatting options

pd.set_option('display.float_format', lambda x: f'{x:.3f}')
pd.set_option('display.float_format', lambda x: f'{x:,.3f}')
pd.set_option('display.precision', 2) # set max decimal places in dataframe

Load CSV

df=pd.read_csv('allcars.csv', index_col=0, low_memory=False)

DataFrames Info

df.info()

DataFrames Description

df.describe()
df['SalePrice'].describe()

DataFrames Head

df.head()

Merging DataFrames

pd.concat([dataframe1, dataframe2], ignore_index = True) # vertically
pd.concat([dataframe1, dataframe2], axis=1) # horizontally

## merge two dataframes based on nearest time with a 60s window
merge.merge_asof(df1, df2, on=’time’, by=’ticker’, tolerance=pd.Timedelta(‘60s’)), direction = ‘nearest’

DataFrames misc...

df.shape
df.columns
df.isna()
df.isnull()

Convert Column Types

df['timeStamp'] = pd.to_datetime(df['timeStamp'])

Sort by Columns

df.sort_values(['speed','tPos'], ascending=[False, True]).head()

Column Operations

df['deviceID'].value_counts()
df[['tripID','deviceID']].value_counts()

Unique Values

df['deviceID'].unique()

Create a Subset of Columns

df_subset=df[[‘tripID’,’deviceID’,’speed’]]
df_subset.head()

Set the Index Column

df.set_index('Name')
df.reset_index()

Filter Rows / Columns

df[df['tripID']==0]

By Label

df.loc[(df.tripID == 0) & (df.deviceID == 1) & (df.speed > 10)]

By Index

df.iloc[0:4, 0:3] # row slicing, column slicing

Aggregates

df.groupby('deviceID')['speed'].mean()
df.groupby('deviceID')['speed'].max()
df['speed'].max()
df['speed'].min()
df['speed'].sum()

Visualisations

ref: https://www.kaggle.com/code/pmarcelino/comprehensive-data-exploration-with-python/notebook

Histograms and Distributions


import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.displot(df['speed'])
sns.histplot(df['speed'])

sns.histplot(df.loc[(df['gps_speed'] > 0)]['gps_speed'])

## set axis limts
ax = sns.histplot(df['gps_speed'])
ax.set(ylim=(0,100000))
ax.set(xlim=(0,100))

Scatter Plots

data=pd.concat([df['tPos'], df['speed']], axis=1)
data.plot.scatter(x='speed',y='tPos', ylim=(0, 200), xlim=(0,200))

Correlation Matrix

# overall data frame
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);

# saleprice correlation matrix
k = 10 # number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

Variable Pair Correlations

train_df=df[['rpm', 'kpl', 'gps_speed']].dropna()
sns.set()
sns.pairplot(train_df, height=2.5)
plt.show()

![](images/Pasted image 20230719212634.png)

Load JSON data

file_name="TMA_20230727_TTD_Sample/202307270000020e05-001_TTD_DAT_251690415988069.json"  

import json

with open(file_name) as data_file:
file_contents = data_file.read()

parsed_json = json.loads(file_contents)

Load JSON into Pandas DataFrame

df = pd.json_normalize(parsed_json,  
record_path =['deviceRecords', 'records'],
meta=['tdeVersion', 'batchId', ['deviceRecords','device','id']]
)