- A DataFrame is like a table, consisting of rows and columns.
- A DataFrame is a collection of Series (one Series per column).
- Series is a list of values.
- Multiple series become a DataFrame.
import pandas as pd
d = {'A': 11, 'B': 11}
df = pd.DataFrame.from_records([d])
df
A | B | |
---|---|---|
0 | 11 | 11 |
d1 = {'A': 11, 'B': 12}
d2 = {'A': 21, 'B': 22}
d3 = {'A': 31, 'B': 32}
d4 = {'A': 41, 'B': 42}
d5 = {'A': 51, 'B': 52}
d_list = [d1, d2, d3, d4, d5]
df = pd.DataFrame.from_records(d_list)
df
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
2 | 31 | 32 |
3 | 41 | 42 |
4 | 51 | 52 |
dv = {
'A': [11, 21, 31, 41, 51],
'B': [12, 22, 32, 42, 52],
}
df = pd.DataFrame.from_dict(dv)
df
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
2 | 31 | 32 |
3 | 41 | 42 |
4 | 51 | 52 |
df.columns
Index(['A', 'B'], dtype='object')
list(df.columns)
['A', 'B']
n = 0
df.columns[n]
'A'
df['A'] # or
df.A
0 11
1 21
2 31
3 41
4 51
Name: A, dtype: int64
df[['A']]
A | |
---|---|
0 | 11 |
1 | 21 |
2 | 31 |
3 | 41 |
4 | 51 |
Keep in mind that there is no way to get multiple columns as a single Series, because a DataFrame is itself a collection of Series; selecting several columns always returns a DataFrame.
df[['A','B']]
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
2 | 31 | 32 |
3 | 41 | 42 |
4 | 51 | 52 |
df # or
df[:]
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
2 | 31 | 32 |
3 | 41 | 42 |
4 | 51 | 52 |
df[0:2]
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
Using a step of 2 gets you every second row (the rows at even positions)
df[0:4:2]
A | B | |
---|---|---|
0 | 11 | 12 |
2 | 31 | 32 |
df[::2]
A | B | |
---|---|---|
0 | 11 | 12 |
2 | 31 | 32 |
4 | 51 | 52 |
df[2:]
A | B | |
---|---|---|
2 | 31 | 32 |
3 | 41 | 42 |
4 | 51 | 52 |
df[:3]
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
2 | 31 | 32 |
df[::-1]
A | B | |
---|---|---|
4 | 51 | 52 |
3 | 41 | 42 |
2 | 31 | 32 |
1 | 21 | 22 |
0 | 11 | 12 |
- DataFrames have an index.
- Index is row number by default and starts with 0.
- The index can be defined when the DataFrame is created, using the 'index' parameter. If it is not provided, row numbers are used as the index by default.
- The index is also Series-like (one-dimensional) if it consists of only one column.
- Multi-column indexes (MultiIndex) are also supported.
Default index is row number which can be seen in output
df
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
2 | 31 | 32 |
3 | 41 | 42 |
4 | 51 | 52 |
Which is from 0 to 4 in the above example
Since it is the default index, it is a range index
df.index
RangeIndex(start=0, stop=5, step=1)
dv = {
'i': [1, 2, 3, 4, 5],
'A': [11, 21, 31, 41, 51],
'B': [12, 22, 32, 42, 52],
}
df2 = pd.DataFrame.from_dict(dv).set_index('i')
df2
A | B | |
---|---|---|
i | ||
1 | 11 | 12 |
2 | 21 | 22 |
3 | 31 | 32 |
4 | 41 | 42 |
5 | 51 | 52 |
Here the i
column becomes the index
df2.index
Int64Index([1, 2, 3, 4, 5], dtype='int64', name='i')
As seen above, now the index is a list of integers (int64)
df2.index[3]
4
df2.loc[3]
A 31
B 32
Name: 3, dtype: int64
df2
A | B | |
---|---|---|
i | ||
1 | 11 | 12 |
2 | 21 | 22 |
3 | 31 | 32 |
4 | 41 | 42 |
5 | 51 | 52 |
n = 3
df2.iloc[n]
A 41
B 42
Name: 4, dtype: int64
# or
df2[n:n+1]
A | B | |
---|---|---|
i | ||
4 | 41 | 42 |
The difference between df2.iloc[n]
and df2[n:n+1]
is that iloc
with a single integer returns a Series, while the slice returns a DataFrame (even when it contains only one row).
These helpers will make your life easier
def df_col(self, column) -> pd.Series:
    """Return a copy of one column, selected by position or by label.

    Args:
        column: An integer (plain ``int`` or a NumPy integer scalar) is
            treated as a 0-based position into ``self.columns``; a string is
            treated as a column label.

    Returns:
        A copy of the selected column.

    Raises:
        ValueError: If ``column`` is neither an integer nor a string.
            Booleans are rejected too, even though ``bool`` subclasses
            ``int``.
    """
    import numbers  # local import: keeps this helper self-contained

    # Exclude bool explicitly so True/False are never silently interpreted
    # as positions 1/0.
    if isinstance(column, numbers.Integral) and not isinstance(column, bool):
        return self[self.columns[column]].copy()
    if isinstance(column, str):
        return self[column].copy()
    raise ValueError(f"col parameter must be either type of 'str' or type of 'int' which '{column}' is not")
def df_cols(self, *columns) -> pd.DataFrame:
    """Return a copy of the frame restricted to the requested columns.

    Each argument that is exactly an ``int`` is translated to the column
    label found at that position in ``self.columns``; any other argument is
    used as a column label unchanged. The columns appear in the order given.
    """
    labels = [
        self.columns[ref] if type(ref) is int else ref
        for ref in columns
    ]
    return self[labels].copy()
def df_row(self, ix) -> pd.Series:
    """Return a copy of the row found at position ``ix`` (via ``iloc``)."""
    picked = self.iloc[ix]
    return picked.copy()
def ser_col(self, column):
    """Look up ``column`` in the Series by delegating to ``self.get``."""
    value = self.get(column)
    return value
def ser_cols(self, *columns):
    """Return a tuple holding ``self.get(c)`` for every requested ``c``.

    The values appear in the same order as the arguments.
    """
    return tuple(self.get(key) for key in columns)
def ser_row(self, ix):
    """Return the element(s) at position ``ix`` of the Series' ``values``."""
    raw = self.values
    return raw[ix]
# Attach the helpers to the pandas classes so they can be used as
# df.col(...), df.cols(...), df.row(...) and ser.cols(...), ser.row(...).
pd.DataFrame.col = df_col
pd.DataFrame.cols = df_cols
pd.DataFrame.row = df_row
# `rows` is an alias of `iloc`, so it is indexed with brackets
# (e.g. df.rows[0:3]) rather than called like the helpers above.
pd.DataFrame.rows = pd.DataFrame.iloc
# Series.col reuses Series.get directly (ser_col wraps the same call).
pd.Series.col = pd.Series.get
pd.Series.cols = ser_cols
pd.Series.row = ser_row
# `rows` is an alias of `values`, so it is also indexed with brackets
# (e.g. ser.rows[0:3]).
pd.Series.rows = pd.Series.values
Returns Series
df2.col(0) # Get first column as Series
df2.col('A') # Get column 'A' as Series
i
1 11
2 21
3 31
4 41
5 51
Name: A, dtype: int64
Returns DataFrame
df2.cols(0,1) # Get first and second columns as DataFrame
df2.cols('A', 'B') # Get column 'A' and 'B' as DataFrame
A | B | |
---|---|---|
i | ||
1 | 11 | 12 |
2 | 21 | 22 |
3 | 31 | 32 |
4 | 41 | 42 |
5 | 51 | 52 |
Returns Series
df2.row(0) # Get first row as Series
A 11
B 12
Name: 1, dtype: int64
Returns DataFrame
df2.rows[0:3] # Get rows from 0 to 3 (excluding 3) as DataFrame
A | B | |
---|---|---|
i | ||
1 | 11 | 12 |
2 | 21 | 22 |
3 | 31 | 32 |
df2.row(0).col(0) # Get first column of first row as value
df2.row(0).col('A') # Get column 'A' of first row as value
11
df2.row(0).cols(0, 1) # Get first two columns of first row as tuple
df2.row(0).cols('A', 'B') # Get 'A' and 'B' columns of first row as tuple
(11, 12)
df2.rows[0:3].col(0) # Get first column of rows from 0 to 3 (excluding 3) as Series
df2.rows[0:3].col('A') # Get column 'A' of rows from 0 to 3 (excluding 3) as Series
i
1 11
2 21
3 31
Name: A, dtype: int64
df2.rows[0:3].cols(0, 1) # Get first two column of rows from 0 to 3 (excluding 3) as DataFrame
df2.rows[0:3].cols('A', 'B') # Get 'A' and 'B' columns of rows from 0 to 3 (excluding 3) as DataFrame
A | B | |
---|---|---|
i | ||
1 | 11 | 12 |
2 | 21 | 22 |
3 | 31 | 32 |
df2.col(0).row(0) # Get first row of first column as value
df2.col('A').row(0) # Get first row of column 'A' as value
11
df2.col(0).rows[0:3] # Get rows from 0 to 3(excluding) of first column as numpy.ndarray
df2.col('A').rows[0:3] # Get rows from 0 to 3(excluding) of column 'A' as numpy.ndarray
array([11, 21, 31])
df2.cols(0, 1).row(0) # Get first row of first two columns as Series
df2.cols('A', 'B').row(0) # Get first row of columns 'A' and 'B' as Series
A 11
B 12
Name: 1, dtype: int64
df.cols(0, 1).rows[0:3] # Get rows from 0 to 3(excluding) of first two columns as DataFrame
df.cols('A', 'B').rows[0:3] # Get rows from 0 to 3(excluding) of columns 'A' and 'B' as DataFrame
A | B | |
---|---|---|
0 | 11 | 12 |
1 | 21 | 22 |
2 | 31 | 32 |