import pandas as pd
import numpy as np
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside)) #create a list of tuples
hier_index
[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
#create a multiindex
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
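Since every (outer, inner) combination is present here, pd.MultiIndex.from_product builds the same index more compactly:
# equivalent to the from_tuples call above
pd.MultiIndex.from_product([['G1', 'G2'], [1, 2, 3]])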
# Create a dataframe (6,2) with multi level index
df = pd.DataFrame(np.random.randn(6,2),index=hier_index,columns=['A','B'])
df
|    |   | A | B |
|----|---|-----------|-----------|
| G1 | 1 | -0.349997 | 1.372488 |
|    | 2 | 0.090160 | 0.121530 |
|    | 3 | 0.751559 | -0.335218 |
| G2 | 1 | 1.501890 | 0.835727 |
|    | 2 | -1.542555 | -1.878225 |
|    | 3 | 0.721939 | -0.121186 |
Accessing rows and columns
You can chain loc and iloc calls to access elements, working from the outer index down to the inner index.
#access columns as usual
df['A']
G1  1   -0.349997
    2    0.090160
    3    0.751559
G2  1    1.501890
    2   -1.542555
    3    0.721939
Name: A, dtype: float64
#access rows
df.loc['G1']
|   | A | B |
|---|-----------|----------|
1 | -0.349997 | 1.372488 |
2 | 0.090160 | 0.121530 |
3 | 0.751559 | -0.335218 |
# access a single row from the inner index
df.loc['G1'].loc[1]
A   -0.349997
B    1.372488
Name: 1, dtype: float64
#access a single cell
df.loc['G2'].loc[3]['B']
-0.121186
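Instead of chaining, loc also accepts a tuple for the row and a column label in a single call, which avoids the intermediate copies:
# same cell as above, one lookup instead of three
df.loc[('G2', 3), 'B']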
Naming indices
Index levels can be given names, which display much like column names.
df.index.names
FrozenList([None, None])
df.index.names = ['Group', 'Serial']
df
| Group | Serial | A | B |
|-------|--------|-----------|-----------|
| G1 | 1 | -0.349997 | 1.372488 |
|    | 2 | 0.090160 | 0.121530 |
|    | 3 | 0.751559 | -0.335218 |
| G2 | 1 | 1.501890 | 0.835727 |
|    | 2 | -1.542555 | -1.878225 |
|    | 3 | 0.721939 | -0.121186 |
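With named levels you can also pull one level out as a flat index:
# all values of the outer level, one per row
df.index.get_level_values('Group')
Index(['G1', 'G1', 'G1', 'G2', 'G2', 'G2'], dtype='object', name='Group')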
Accessing rows and columns using cross sections
The xs method returns a cross section. Its advantage is that it can reach through a multilevel index in a single step. Now that the levels are named, we can use cross sections effectively.
# Get all rows with Serial 1
df.xs(1, level='Serial')
| Group | A | B |
|-------|-----------|----------|
| G1 | -0.349997 | 1.372488 |
| G2 | 1.501890 | 0.835727 |
# Get the row with Group G1 and Serial 2
df.xs(('G1', 2))
A    0.09016
B    0.12153
Name: (G1, 2), dtype: float64
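By default xs drops the level it selected on from the result. Pass drop_level=False to keep it:
# same cross section, but the Serial level stays in the index
df.xs(1, level='Serial', drop_level=False)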
Handling missing values
pandas marks missing entries as NaN. Let's build a small DataFrame with a few of them to see how dropna and fillna behave.
d = {'a':[1,2,np.nan], 'b':[np.nan, 5, np.nan], 'c':[6,7,8]}
dfna = pd.DataFrame(d)
dfna
|   | a | b | c |
|---|-----|-----|---|
0 | 1.0 | NaN | 6 |
1 | 2.0 | 5.0 | 7 |
2 | NaN | NaN | 8 |
# dropping rows with one or more na values
dfna.dropna()
|   | a | b | c |
|---|-----|-----|---|
1 | 2.0 | 5.0 | 7 |
# dropping cols with one or more na values
dfna.dropna(axis=1)
|   | c |
|---|---|
0 | 6 |
1 | 7 |
2 | 8 |
# Dropping rows that have fewer than 2 non-NaN values (thresh counts the non-NaN cells needed to keep a row)
dfna.dropna(axis=0, thresh=2)
|   | a | b | c |
|---|-----|-----|---|
0 | 1.0 | NaN | 6 |
1 | 2.0 | 5.0 | 7 |
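dropna also takes a subset argument when only certain columns should decide whether a row is dropped; a small sketch:
# drop rows where column 'a' is NaN; NaNs in 'b' are ignored
dfna.dropna(subset=['a'])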
fillna
fillna replaces NaN cells with a value you supply.
dfna.fillna(value=999)
|   | a | b | c |
|---|-------|-------|---|
0 | 1.0 | 999.0 | 6 |
1 | 2.0 | 5.0 | 7 |
2 | 999.0 | 999.0 | 8 |
# fill each column's NaNs with that column's mean
dfna.fillna(value=dfna.mean())
|   | a | b | c |
|---|-----|-----|---|
0 | 1.0 | 5.0 | 6 |
1 | 2.0 | 5.0 | 7 |
2 | 1.5 | 5.0 | 8 |
# fill a single column with its own mean
dfna['a'].fillna(value = dfna['a'].mean())
0    1.0
1    2.0
2    1.5
Name: a, dtype: float64
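Besides constants and means, you can propagate the previous valid value forward with ffill (or the next one backward with bfill):
# forward fill: row 2 of 'a' becomes 2.0; row 0 of 'b' stays NaN since nothing precedes it
dfna.ffill()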
Groupby
groupby splits a DataFrame into groups by the values of a column so you can aggregate each group separately. Let's build a small company/sales table.
comp_data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
'Sales':[200,120,340,124,243,350]}
comp_df = pd.DataFrame(comp_data)
comp_df
|   | Company | Person | Sales |
|---|---------|--------|-------|
0 | GOOG | Sam | 200 |
1 | GOOG | Charlie | 120 |
2 | MSFT | Amy | 340 |
3 | MSFT | Vanessa | 124 |
4 | FB | Carl | 243 |
5 | FB | Sarah | 350 |
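A groupby object can also hand back the rows of a single group via get_group:
# just the FB rows
comp_df.groupby('Company').get_group('FB')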
mean min max
# mean Sales by Company; numeric_only=True restricts the aggregation to numerical columns (recent pandas requires this when non-numeric columns like Person are present)
comp_df.groupby('Company').mean(numeric_only=True)
| Company | Sales |
|---------|-------|
FB | 296.5 |
GOOG | 160.0 |
MSFT | 232.0 |
# standard deviation in Sales by Company
comp_df.groupby('Company').std(numeric_only=True)
| Company | Sales |
|---------|-------|
FB | 75.660426 |
GOOG | 56.568542 |
MSFT | 152.735065 |
You can run other aggregation functions like min, max, std, and count the same way, or request several at once with agg, as sketched below. Then we look at describe, which computes all the common statistics in one call.
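A quick agg sketch; selecting the Sales column first keeps the output flat:
# several aggregations in one pass
comp_df.groupby('Company')['Sales'].agg(['mean', 'min', 'max'])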
describe
comp_df.groupby('Company').describe()
(all statistics below describe the Sales column)
| Company | count | mean | std | min | 25% | 50% | 75% | max |
|---------|-------|------|-----|-----|-----|-----|-----|-----|
FB | 2.0 | 296.5 | 75.660426 | 243.0 | 269.75 | 296.5 | 323.25 | 350.0 |
GOOG | 2.0 | 160.0 | 56.568542 | 120.0 | 140.00 | 160.0 | 180.00 | 200.0 |
MSFT | 2.0 | 232.0 | 152.735065 | 124.0 | 178.00 | 232.0 | 286.00 | 340.0 |
transpose
Long overdue: you can flip a DataFrame, swapping rows and columns, by calling the transpose() method.
comp_df.groupby('Company').describe().transpose()
|       | Company | FB | GOOG | MSFT |
|-------|---------|------------|------------|------------|
| Sales | count | 2.000000 | 2.000000 | 2.000000 |
|       | mean | 296.500000 | 160.000000 | 232.000000 |
|       | std | 75.660426 | 56.568542 | 152.735065 |
|       | min | 243.000000 | 120.000000 | 124.000000 |
|       | 25% | 269.750000 | 140.000000 | 178.000000 |
|       | 50% | 296.500000 | 160.000000 | 232.000000 |
|       | 75% | 323.250000 | 180.000000 | 286.000000 |
|       | max | 350.000000 | 200.000000 | 340.000000 |
comp_df.groupby('Company').describe().index
Index(['FB', 'GOOG', 'MSFT'], dtype='object', name='Company')
Combining DataFrames
You can concatenate, merge, and join DataFrames.
Let's take a look at two DataFrames.
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],'D': ['D0', 'D1', 'D2', 'D3']}, index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], 'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],'D': ['D4', 'D5', 'D6', 'D7']}, index=[4, 5, 6, 7])
df1
|   | A | B | C | D |
|---|----|----|----|----|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
df2
|   | A | B | C | D |
|---|----|----|----|----|
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
concat
pd.concat(list_of_dfs, axis=0) stacks DataFrames along rows (axis=0) or columns (axis=1). The frames should share column names when stacking along rows, and share index values when stacking along columns.
# extend along rows
pd.concat([df1, df2]) # flows well because the indexes are sequential and the columns match
|   | A | B | C | D |
|---|----|----|----|----|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
#extend along columns
pd.concat([df1, df2], axis=1) # fills NaN where the indexes don't match
|   | A | B | C | D | A | B | C | D |
|---|-----|-----|-----|-----|-----|-----|-----|-----|
0 | A0 | B0 | C0 | D0 | NaN | NaN | NaN | NaN |
1 | A1 | B1 | C1 | D1 | NaN | NaN | NaN | NaN |
2 | A2 | B2 | C2 | D2 | NaN | NaN | NaN | NaN |
3 | A3 | B3 | C3 | D3 | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | A4 | B4 | C4 | D4 |
5 | NaN | NaN | NaN | NaN | A5 | B5 | C5 | D5 |
6 | NaN | NaN | NaN | NaN | A6 | B6 | C6 | D6 |
7 | NaN | NaN | NaN | NaN | A7 | B7 | C7 | D7 |
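If the original indexes carry no meaning, ignore_index=True renumbers the result from 0:
# fresh 0..7 index regardless of the inputs' indexes
pd.concat([df1, df2], ignore_index=True)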
merge
merge lets you do a SQL-style join with inner, outer, left, and right modes:
pd.merge(left, right, how='outer', on='key')
where left and right are your two DataFrames (tables) and on names the key column(s) to join on.
left = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],'B': ['C0', 'C1', 'C2', 'C3'],
'C': ['D0', 'D1', 'D2', 'D3']})
left
|   | A | B | key1 |
|---|----|----|------|
0 | A0 | B0 | K0 |
1 | A1 | B1 | K1 |
2 | A2 | B2 | K2 |
3 | A3 | B3 | K3 |
right
|   | B | C | key1 |
|---|----|----|------|
0 | C0 | D0 | K0 |
1 | C1 | D1 | K1 |
2 | C2 | D2 | K2 |
3 | C3 | D3 | K3 |
inner merge
An inner join keeps only the intersection of the keys.
#merge along key1
pd.merge(left, right, how='inner', on='key1')
|   | A | B_x | key1 | B_y | C |
|---|----|-----|------|-----|----|
0 | A0 | B0 | K0 | C0 | D0 |
1 | A1 | B1 | K1 | C1 | D1 |
2 | A2 | B2 | K2 | C2 | D2 |
3 | A3 | B3 | K3 | C3 | D3 |
When both tables share column names that are not part of on, pandas appends the suffixes _x and _y to differentiate them.
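The suffixes parameter overrides the default _x and _y:
# clashing column B becomes B_left and B_right
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))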
merge on multiple columns
Sometimes your key is composite. You can then merge on multiple columns by passing a list to the on argument.
Let's add a key2 column to both tables.
left['key2'] = ['K0', 'K1', 'K0', 'K1']
left
|   | A | B | key1 | key2 |
|---|----|----|------|------|
0 | A0 | B0 | K0 | K0 |
1 | A1 | B1 | K1 | K1 |
2 | A2 | B2 | K2 | K0 |
3 | A3 | B3 | K3 | K1 |
right['key2'] = ['K0', 'K0', 'K0', 'K0']
right
|   | B | C | key1 | key2 |
|---|----|----|------|------|
0 | C0 | D0 | K0 | K0 |
1 | C1 | D1 | K1 | K0 |
2 | C2 | D2 | K2 | K0 |
3 | C3 | D3 | K3 | K0 |
pd.merge(left, right, how='inner', on=['key1', 'key2'])
|   | A | B_x | key1 | key2 | B_y | C |
|---|----|-----|------|------|-----|----|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
1 | A2 | B2 | K2 | K0 | C2 | D2 |
An inner merge keeps only the intersection of the composite keys, hence only two rows.
outer merge
Use how='outer' to keep the union of both tables. pandas fills NaN where a row has no match.
om = pd.merge(left, right, how='outer', on=['key1', 'key2'])
om
|   | A | B_x | key1 | key2 | B_y | C |
|---|----|-----|------|------|-----|----|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
1 | A1 | B1 | K1 | K1 | NaN | NaN |
2 | A2 | B2 | K2 | K0 | C2 | D2 |
3 | A3 | B3 | K3 | K1 | NaN | NaN |
4 | NaN | NaN | K1 | K0 | C1 | D1 |
5 | NaN | NaN | K3 | K0 | C3 | D3 |
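Passing indicator=True adds a _merge column that records whether each row came from the left table, the right table, or both; handy for auditing an outer merge:
# _merge is one of 'left_only', 'right_only', 'both'
pd.merge(left, right, how='outer', on=['key1', 'key2'], indicator=True)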
Sorting
Use DataFrame.sort_values(by=columns, inplace=False, ascending=True)
to sort the table.
om.sort_values(by=['key1', 'key2']) # the merged table, sorted by the key columns
|   | A | B_x | key1 | key2 | B_y | C |
|---|----|-----|------|------|-----|----|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
4 | NaN | NaN | K1 | K0 | C1 | D1 |
1 | A1 | B1 | K1 | K1 | NaN | NaN |
2 | A2 | B2 | K2 | K0 | C2 | D2 |
5 | NaN | NaN | K3 | K0 | C3 | D3 |
3 | A3 | B3 | K3 | K1 | NaN | NaN |
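ascending can also be a list, one flag per sort column:
# key1 ascending, key2 descending
om.sort_values(by=['key1', 'key2'], ascending=[True, False])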
right merge
how='right' keeps all rows of the right table and drops rows of the left table that don't have matching keys.
pd.merge(left, right, how='right', on=['key1', 'key2']).sort_values(by='key1')
|   | A | B_x | key1 | key2 | B_y | C |
|---|----|-----|------|------|-----|----|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
2 | NaN | NaN | K1 | K0 | C1 | D1 |
1 | A2 | B2 | K2 | K0 | C2 | D2 |
3 | NaN | NaN | K3 | K0 | C3 | D3 |
left merge
how='left' similarly keeps all rows of the left table and only those rows of the right table that have matching keys.
pd.merge(left, right, how='left', on=['key1', 'key2']).sort_values(by='key1')
|   | A | B_x | key1 | key2 | B_y | C |
|---|----|-----|------|------|-----|----|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
1 | A1 | B1 | K1 | K1 | NaN | NaN |
2 | A2 | B2 | K2 | K0 | C2 | D2 |
3 | A3 | B3 | K3 | K1 | NaN | NaN |
join
join works like merge but matches on the index instead of columns. By default it does a left join, keeping all rows of the calling DataFrame; pass how='right', 'inner', or 'outer' for the other modes. See the examples below:
df_a = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
df_b = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
df_a
|    | A | B |
|----|----|----|
K0 | A0 | B0 |
K1 | A1 | B1 |
K2 | A2 | B2 |
df_b
|    | C | D |
|----|----|----|
K0 | C0 | D0 |
K2 | C2 | D2 |
K3 | C3 | D3 |
# join df_b onto df_a; default = keep all rows of df_a and matching rows of df_b (left join)
df_a.join(df_b)
|    | A | B | C | D |
|----|----|----|-----|-----|
K0 | A0 | B0 | C0 | D0 |
K1 | A1 | B1 | NaN | NaN |
K2 | A2 | B2 | C2 | D2 |
Thus all rows of df_a survive; where df_b has no matching index (K1), its columns are filled with NaN.
# join df_a onto df_b
df_b.join(df_a)
|    | C | D | A | B |
|----|-----|-----|----|----|
K0 | C0 | D0 | A0 | B0 |
K2 | C2 | D2 | A2 | B2 |
K3 | C3 | D3 | NaN | NaN |
# outer join - union of the two indexes
df_b.join(df_a, how='outer')
|    | C | D | A | B |
|----|-----|-----|----|----|
K0 | C0 | D0 | A0 | B0 |
K1 | NaN | NaN | A1 | B1 |
K2 | C2 | D2 | A2 | B2 |
K3 | C3 | D3 | NaN | NaN |
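An inner join keeps only the indexes present in both frames, here K0 and K2:
# intersection of the two indexes
df_a.join(df_b, how='inner')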