Skip to content

Latest commit

 

History

History
119 lines (81 loc) · 3.84 KB

python pandas snnipets.md

File metadata and controls

119 lines (81 loc) · 3.84 KB

Python Pandas Code snippets

# ========== Import ==========

import pandas as pd
# import csv file to df
df = pd.read_csv("demo.csv")
# import excel file to df
df = pd.read_excel("demo.xlsx")

# ========== Concatenate ==========

df["address"] = df["address"] + ", " + df["city"] + ", " + df["state"] + " " + df["zip"].astype("str")
# option 2
df["address"] = df[["address", "city", "state", "zip"]].apply(lambda x: ", ".join(x), axis=1)
# note: if one of the column is not of type "str", convert it first to "str", like this:
df["zip"] = df["zip"].astype("str")

# ========== Split ==========

df["last name"] = df["full name"].str.split(", ").str[0]
df["first name"] = df["full name"].str.split(", ").str[1]

# ========== Replace a substring ==========

df["address"] = df["address"].str.replace("St", "Street")

# replace substring in multiple columns
df[["address", "city"]] = df[["address", "city"]].replace("a", "@", regex=True)
# note: you have to specify "regex=True". if not, only a whole string will be replaced, not a substring.

# replace substring in a whole data-frame
df = df.replace("a", "@", regex=True)

# multiple replacements
df["address"] = df["address"].replace({"St": "Street", "Ave": "Avenue"}, regex=True)

# replace substring in headers
df.columns = df.columns.str.replace("c", "@")
# you can also use "regex=True" with this
df.columns = df.columns.str.replace(r"\s{2,}", " ", regex=True)

# ========== Trim and remove double spaces ==========

df = df.replace(r"\s{2,}", " ", regex=True)
df = df.replace(r"^\s|\s$", "", regex=True)
# note: this doesn't work for headers

# trim headers
df = df.rename(str.strip, axis=1)

# ========== Rename Columns ==========

df = df.rename({"address": "address1", "city": "city1"}, axis=1)

# ========== add columns to a data-frame ==========

df[["column1", "column2"]] = "" # this will add 2 columns with all values as empty strings.

# to add column from a list even if the list contains columns that are already in the df, use a list comprehension. otherwise it will override the original column
insert_columns = ["address", "column1", "column2"]
df[[c for c in insert_columns if c not in df.columns]] = ""

# ========== drop columns ==========

...

# ========== filter a data-frame ==========

df = df[df["state"] == "TX"]
# or
df = df[df["amount"] > 5]

# ========== Grouping ==========

# When grouping a column, it returns a "GroupBy" object. To convert it back to a DataFrame, you need to aggregate one or more of the other columns and then reset the index. If you just want the count of all other columns, you can use `size()`, which will return a Series. To convert it back to a DataFrame, reset the index and pass the parameter `name=name of count column`.

# This will return the state column and the sum of the amount column.
df = df.groupby("state")[["amount"]].sum()  # df is a DataFrame. (Grouping and Aggregation in 1 line)
df = df.reset_index()

# Multiple aggregations.
df = df.groupby("address")  # df is a GroupBy object
df = df.agg({"state": "nunique", "amount": "sum"})  # df is converted back to a DataFrame
df = df.reset_index()

# Group by 2 columns and the count of all other rows.
df = df.groupby(["address", "state"]).size()  # df is a Series
df = df.reset_index(name="count of rows")  # df is converted back to a DataFrame

# ========== Export ==========

df.to_csv("demo.csv", index=False)

# ========== Styling ==========
style = [
    """\
color: green;\
background-color: #D3D3D3;\
font-family: Menlo;\
font-style: italic;\
font-size: 18px\
"""
]

df_styled = df.style.apply(lambda x: style * len(x))

# then export
df_styled.to_excel("styled_data.xlsx", engine="openpyxl", index=False)

# conditional formatting (applies this style where the "state" = "TX")
df_styled = df.style.apply(lambda x: df["state"].map({"TX": "color: red"}))
# then export