I/O
import pandas as pd

# read a csv file, using the first column as the index
df = pd.read_csv(filename, index_col=0)
# write df to csv file
df.to_csv(filename)
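As a minimal round trip (the file name "data.csv" is hypothetical):
# write a small df and read it back, with the index preserved
df = pd.DataFrame({"id": [1, 2], "value": [10.0, 20.0]}).set_index("id")
df.to_csv("data.csv")                        # the index becomes the first column
df = pd.read_csv("data.csv", index_col=0)    # ... and is restored on read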
Exploratory data analysis
Consider these attributes (some can be spot-checked in code, as sketched below):
Necessary, but not sufficient:
- of known provenance
- well-annotated
Important:
- timely
- complete
- high volume
- multivariate
- atomic
Achievable:
- consistent
- clear
- dimensionally structured
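A rough spot check of "complete" and "consistent" on a loaded df, as a sketch:
# share of non-null values per column (completeness)
1 - df.isna().mean()
# one well-defined dtype per column hints at consistency
df.dtypes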
# show the df's first ten rows
df.head(10)
# show the df's tail
df.tail()
# show rows 501-503 (by integer position)
df.iloc[501:504]
# select rows and columns by integer position (iloc takes positions, not names)
df.iloc[501:504, [col_index1, col_index2, ...]]
# shape: (number of rows, number of columns)
df.shape
# show information about the data frame
df.info()
# count of rows
len(df)
# count of columns
len(df.columns)
# show df's columns
df.columns
# show columns' data types
df.dtypes
# show mean, quartiles, etc
df.describe()
# count nulls per column
df.isna().sum()
# value counts of column
df[column].value_counts()
# sort by column and show the first 50 entries
df.sort_values(by=column).head(50)
# maximum value of column
df[column].max()
# show 0.05 quantile of column
df[column].quantile(0.05)
# show unique values of column
df[column].unique()
# how many unique values does column have
df[column].nunique()
# mean and median of column1 grouped by column2
df.groupby(column2)[column1].mean()
df.groupby(column2)[column1].median()
# mean and median of column1 grouped by column2 and column3
df.groupby([column2, column3])[column1].mean()
df.groupby([column2, column3])[column1].median()
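A small self-contained example of the grouped mean (the data is invented):
import pandas as pd

demo = pd.DataFrame({
    "day": ["Mon", "Mon", "Tue", "Tue"],
    "time": ["am", "pm", "am", "pm"],
    "tip": [1.0, 2.0, 3.0, 4.0],
})
# mean tip per (day, time) group
demo.groupby(["day", "time"])["tip"].mean()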
Data wrangling
# rename a column in place
df.rename(columns={old_name: new_name}, inplace=True)
# perform multiplication on all values of a column
df[column] = df[column]*1.1
# convert string column to date column
df[column] = pd.to_datetime(df[column], format="%Y-%m-%d")
# replace values in a column
recoding = {
    old1: new1,
    old2: new2,
}
df[column] = df[column].replace(recoding)
# avoid replace(..., inplace=True) on a column selection: it can act on a
# copy and leave df unchanged, so prefer the assignment above
# fill missing values with a constant
df[column] = df[column].fillna(imputation_value)
# forward-fill missing values ... for time series
df[column] = df[column].ffill()
# drop all rows that contain nulls
df.dropna(axis=0)
# drop all columns that contain nulls
df.dropna(axis=1)
# drop the rows with index labels 0, 2, 4
df.drop([0, 2, 4], axis=0)
# drop duplicate rows
df = df.drop_duplicates()
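Putting several of these steps together (column names and values invented):
import pandas as pd

raw = pd.DataFrame({
    "dt": ["2024-01-01", "2024-01-02", "2024-01-02", None],
    "amt": [100.0, None, None, 50.0],
})
raw = raw.rename(columns={"dt": "date", "amt": "amount"})
raw["date"] = pd.to_datetime(raw["date"], format="%Y-%m-%d")
raw["amount"] = raw["amount"].ffill()       # forward-fill the missing amounts
raw = raw.drop_duplicates().dropna(axis=0)  # dedupe, then drop the dateless row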
Querying
# show all rows of df where column equals value
df[df[column]==value]
# querying with loc
df.loc[df.column <= value]
df.loc[(df.column1 <= value1) & (df.column2 == value2)]
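The same filter can also be written with df.query, which reads well once several conditions pile up; a small invented example:
import pandas as pd

demo = pd.DataFrame({"price": [5, 15, 25], "city": ["A", "B", "A"]})
limit = 20
# equivalent to demo.loc[(demo.price <= limit) & (demo.city == "A")]
demo.query("price <= @limit and city == 'A'")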
Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# histogram of column
df.hist(column)
# bar chart of column
df[column] \
.value_counts() \
.plot.bar(figsize=(10,5), title="...", xlabel="...", ylabel="...")
plt.show()
# line diagram of column1 grouped by column2
df.groupby([column2])[column1] \
.mean() \
.plot(kind="line", figsize=(..., ...), title="...", xlabel="...", ylabel="...")
plt.show()
# line diagram of group counts over column1, one line per value of column2
df.groupby([column1, column2]) \
.size() \
.unstack(column2) \
.plot(kind="line", figsize=(..., ...), title="...", xlabel="...", ylabel="...")
plt.show()
# as above, but with a running total (cumulative sum) per group
df.groupby([column1, column2]).size() \
.groupby(level=1).cumsum() \
.unstack(column2) \
.plot(kind="line", figsize=(..., ...), title="...", xlabel="...", ylabel="...")
plt.show()
# ecdf (empirical cumulative distribution function) plot
sns.ecdfplot(df, x=column1, hue=column2, stat="count") \
.set(title="...", xlabel="...", ylabel="...")
plt.show()
# pie chart with subplots
df \
.groupby([column1, column2]) \
.size() \
.unstack(column2) \
.plot(kind="pie", subplots=True, figsize=(..., ...), title="...")
plt.show()
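A self-contained sketch of the grouped line plot, with synthetic data (all names invented):
import pandas as pd
import matplotlib.pyplot as plt

demo = pd.DataFrame({
    "year": [2020, 2020, 2020, 2021, 2021, 2022],
    "kind": ["a", "b", "b", "a", "b", "a"],
})
# count rows per (year, kind), then draw one line per kind
demo.groupby(["year", "kind"]).size() \
.unstack("kind") \
.plot(kind="line", figsize=(8, 4), title="rows per year by kind")
plt.show()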