I/O
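
All snippets below assume the usual imports:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns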

# read a csv file, using the first column as the index
df = pd.read_csv(filename, index_col=0)

# write df to csv file
df.to_csv(filename)

Exploratory data analysis

Consider these attributes of your data:

  • Necessary, but not sufficient

    • of known provenance
    • well-annotated
  • Important

    • timely
    • complete
    • high volume
    • multivariate
    • atomic
  • Achievable

    • consistent
    • clear
    • dimensionally structured

# show the df's first ten rows
df.head(10)

# show the df's tail
df.tail()

# show rows 501-503
df.iloc[501:504]

# only show selected columns (by integer position) with iloc
df.iloc[501:504, [column1, column2, ...]]

# shape
df.shape

# show information about the data frame
df.info()

# count of rows
len(df)

# count of columns
len(df.columns)

# show df's columns
df.columns

# show columns' data types
df.dtypes

# show mean, quartiles, etc.
df.describe()

# count nulls per column
df.isna().sum()

# value counts of column
df[column].value_counts()

# sort by column and show the first 50 entries
df.sort_values(by=column).head(50)

# maximum value of column
df[column].max()

# show 0.05 quantile of column
df[column].quantile(0.05)

# unique values of column
df[column].unique()

# how many unique values does column have
df[column].nunique()

# mean and median of column1 grouped by column2
df.groupby(column2)[column1].mean()
df.groupby(column2)[column1].median()

# mean and median of column1 grouped by column2 and column3
df.groupby([column2, column3])[column1].mean()
df.groupby([column2, column3])[column1].median()
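
A small sketch with made-up columns, showing a grouped mean on a concrete frame:

# illustrative example: mean bill for each (day, time) pair
tips = pd.DataFrame({
    "day": ["Mon", "Mon", "Tue", "Tue"],
    "time": ["lunch", "dinner", "lunch", "dinner"],
    "bill": [10.0, 20.0, 12.0, 24.0],
})
tips.groupby(["day", "time"])["bill"].mean()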

Data wrangling

# rename a column in place
df.rename(columns={old_name: new_name}, inplace=True)

# perform multiplication on all values of a column
df[column] = df[column]*1.1

# convert string column to date column
df[column] = pd.to_datetime(df[column], format="%Y-%m-%d")

# replace values in a column
recoding = {
    old1: new1,
    old2: new2
}

df[column] = df[column].replace(recoding)

# fill missing values
df[column].fillna(imputation_value)
df[column].ffill() # forward fill ... for time series

# drop all rows that contain nulls
df.dropna(axis=0)

# drop all columns that contain nulls
df.dropna(axis=1)

# drop rows with index labels 0, 2, 4
df.drop([0, 2, 4], axis=0)

# drop duplicate rows
df = df.drop_duplicates()
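
A hypothetical end-to-end cleaning pass combining the steps above (all names and values are made up):

orders = pd.DataFrame({
    "order_date": ["2024-01-02", "2024-01-02", None],
    "amt": [100.0, 100.0, 90.0],
})
orders.rename(columns={"amt": "amount"}, inplace=True)                           # rename a column
orders["order_date"] = pd.to_datetime(orders["order_date"], format="%Y-%m-%d")  # string to date
orders["order_date"] = orders["order_date"].ffill()                             # fill missing values
orders = orders.drop_duplicates()                                               # drop duplicate rows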

Querying

# show all rows of df where column equals value
df[df[column]==value]

# querying with loc
df.loc[df[column] <= value]
df.loc[(df[column1] <= value1) & (df[column2] == value2)]
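
A quick sketch of the same boolean-mask style on a concrete, made-up frame:

people = pd.DataFrame({"age": [17, 34, 52], "country": ["DE", "DE", "US"]})
people.loc[(people["age"] >= 18) & (people["country"] == "DE")]  # selects the 34-year-old row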

Visualization

# histogram of column
df.hist(column)

# bar chart of column
df[column] \
    .value_counts() \
    .plot.bar(figsize=(10,5), title="...", xlabel="...", ylabel="...")
plt.show()
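
A concrete version of the bar chart above, on a made-up frame:

toys = pd.DataFrame({"color": ["red", "red", "blue", "green", "blue", "red"]})
toys["color"] \
    .value_counts() \
    .plot.bar(figsize=(10, 5), title="Toy colors", xlabel="color", ylabel="count")
plt.show()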

# line diagram of column1 grouped by column2
df.groupby([column2])[column1] \
    .mean() \
    .plot(kind="line", figsize=(..., ...), title="...", xlabel="...", ylabel="...")
plt.show()

# show line diagram grouped by column1 and unstack by column2
df.groupby([column1, column2]) \
    .size() \
    .unstack(column2) \
    .plot(kind="line", figsize=(..., ...), title="...", xlabel="...", ylabel="...")
plt.show()
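
A concrete sketch of the group-and-unstack pattern, on a made-up frame:

signups = pd.DataFrame({
    "month": [1, 1, 2, 2, 2, 3],
    "plan": ["free", "pro", "free", "free", "pro", "pro"],
})
signups.groupby(["month", "plan"]) \
    .size() \
    .unstack("plan") \
    .plot(kind="line", figsize=(10, 5), title="Signups per month by plan", xlabel="month", ylabel="count")
plt.show()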

# the same, but as cumulative counts (cumsum within each column2 group)
df.groupby([column1, column2]).size() \
    .groupby(level=1).cumsum() \
    .unstack(column2) \
    .plot(kind="line", figsize=(..., ...), title="...", xlabel="...", ylabel="...")
plt.show()

# ecdf (empirical cumulative distribution function) plot
sns.ecdfplot(data=df, x=column1, hue=column2, stat="count") \
    .set(title="...", xlabel="...", ylabel="...")
plt.show()

# pie chart with subplots
df \
    .groupby([column1, column2]) \
    .size() \
    .unstack(column2) \
    .plot \
    .pie(subplots=True, figsize=(..., ...), title="...")
plt.show()