import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
sns.set(style='whitegrid')
warnings.filterwarnings('ignore')
pd.show_versions()
INSTALLED VERSIONS
------------------
commit : a671b5a8bf5dd13fb19f0e88edc679bc9e15c673
python : 3.11.5.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.22621
machine : AMD64
processor : Intel64 Family 6 Model 154 Stepping 3, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : Chinese (Simplified)_China.936
pandas : 2.1.4
numpy : 1.24.3
pytz : 2023.3.post1
dateutil : 2.8.2
setuptools : 68.2.2
pip : 23.3.1
Cython : None
pytest : 7.4.0
hypothesis : None
sphinx : 5.0.2
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.9.3
html5lib : 1.1
pymysql : 1.0.2
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.15.0
pandas_datareader : None
bs4 : 4.12.2
bottleneck : 1.3.5
dataframe-api-compat: None
fastparquet : None
fsspec : 2023.10.0
gcsfs : None
matplotlib : 3.8.0
numba : 0.58.1
numexpr : 2.8.7
odfpy : None
openpyxl : 3.0.10
pandas_gbq : None
pyarrow : 11.0.0
pyreadstat : None
pyxlsb : None
s3fs : 2023.10.0
scipy : 1.11.4
sqlalchemy : 2.0.21
tables : 3.8.0
tabulate : 0.9.0
xarray : 2023.6.0
xlrd : 2.0.1
zstandard : 0.19.0
tzdata : 2023.3
qtpy : 2.4.1
pyqt5 : None
分组的核心操作分为三步:
df_english=pd.DataFrame(
{
"student": ["John", "James", "Jennifer"],
"gender": ["male", "male", "female"],
"score": [20, 30, 30],
"subject": "english"
}
)
df_math=pd.DataFrame(
{
"student": ["John", "James", "Jennifer"],
"gender": ["male", "male", "female"],
"score": [90, 100, 95],
"subject": "math"
}
)
df = pd.concat([df_english, df_math],ignore_index=True)
df = df.sort_values(['student','subject']).reset_index(drop=True)
df
student | gender | score | subject | |
---|---|---|---|---|
0 | James | male | 30 | english |
1 | James | male | 100 | math |
2 | Jennifer | female | 30 | english |
3 | Jennifer | female | 95 | math |
4 | John | male | 20 | english |
5 | John | male | 90 | math |
# 不指定列
df.groupby(by='student').count()
gender | score | subject | |
---|---|---|---|
student | |||
James | 2 | 2 | 2 |
Jennifer | 2 | 2 | 2 |
John | 2 | 2 | 2 |
# 指定列
df.groupby(by='student')['score'].count()
student
James 2
Jennifer 2
John 2
Name: score, dtype: int64
groupby()
默认有排序,如果不想要排序可以设置sort=False
关闭。
df.groupby(by=['student', 'score'])['subject'].agg(['count', 'size'])
count | size | ||
---|---|---|---|
student | score | ||
James | 30 | 1 | 1 |
100 | 1 | 1 | |
Jennifer | 30 | 1 | 1 |
95 | 1 | 1 | |
John | 20 | 1 | 1 |
90 | 1 | 1 |
df.groupby(by=['student', 'score'], sort=False)['subject'].agg(['count', 'size'])
count | size | ||
---|---|---|---|
student | score | ||
James | 30 | 1 | 1 |
100 | 1 | 1 | |
Jennifer | 30 | 1 | 1 |
95 | 1 | 1 | |
John | 20 | 1 | 1 |
90 | 1 | 1 |
groupby()
会把分组列作为繁华dataframe的索引,如果不想这样,可以设置as_index=False
参数。
df.groupby(by='student')['score'].agg(['mean', 'max', 'min'])
mean | max | min | |
---|---|---|---|
student | |||
James | 65.0 | 100 | 30 |
Jennifer | 62.5 | 95 | 30 |
John | 55.0 | 90 | 20 |
df.groupby(by='student', as_index=False)['score'].agg(['mean', 'max', 'min'])
student | mean | max | min | |
---|---|---|---|---|
0 | James | 65.0 | 100 | 30 |
1 | Jennifer | 62.5 | 95 | 30 |
2 | John | 55.0 | 90 | 20 |
groupby()
分组时默认忽略空值,但实际中有时需要对变量空值进行统计,可设置参数dropna=False
。
dd = pd.DataFrame({
'A':['a1','a1','a2','a3'],
'B':['b1',None,'b2','b3'],
'C':[1,2,3,4],
'D':[5,None,9,10]})
dd
A | B | C | D | |
---|---|---|---|---|
0 | a1 | b1 | 1 | 5.0 |
1 | a1 | None | 2 | NaN |
2 | a2 | b2 | 3 | 9.0 |
3 | a3 | b3 | 4 | 10.0 |
# 默认忽略空值
dd.groupby('B').sum()
A | C | D | |
---|---|---|---|
B | |||
b1 | a1 | 1 | 5.0 |
b2 | a2 | 3 | 9.0 |
b3 | a3 | 4 | 10.0 |
# 分组空值加入统计
dd.groupby('B', dropna=False).sum()
A | C | D | |
---|---|---|---|
B | |||
b1 | a1 | 1 | 5.0 |
b2 | a2 | 3 | 9.0 |
b3 | a3 | 4 | 10.0 |
NaN | a1 | 2 | 0.0 |
除数据列外,我们也可以通过索引层级来实现分组。
df = df.set_index(['student', 'subject'])
df
gender | score | ||
---|---|---|---|
student | subject | ||
James | english | male | 30 |
math | male | 100 | |
Jennifer | english | female | 30 |
math | female | 95 | |
John | english | male | 20 |
math | male | 90 |
# 指定level=0分组
df.groupby(level=0).count()
gender | score | |
---|---|---|
student | ||
James | 2 | 2 |
Jennifer | 2 | 2 |
John | 2 | 2 |
# 指定level=1分组
df.groupby(level=1).count()
gender | score | |
---|---|---|
subject | ||
english | 3 | 3 |
math | 3 | 3 |
即在分组对象后应用聚合函数,有内置方法和自定义方法两种。
方法 | 概述 |
---|---|
any() | 计算组中的任何数是否为真 |
all() | 计算组中的任何数是否全都为真 |
count() | 计算组中非NA的数量 |
cov()* | 计算组的协方差 |
first() | 计算每组第一个出现的值 |
idxmax()* | 最大值的索引 |
idxmin()* | 最小值的索引 |
last() | 每组最后一个出现的值 |
max() | 每组的最大值 |
min() | 每组的最小值 |
mean() | 每组的平均数 |
median() | 每组的中位数 |
nunique() | 每组中唯一值的数量 |
prod() | 每组的乘积 |
quantile() | 每组中值的给定分位数 |
sem() | 计算每组中值的平均值的标准差 |
size() | 值的数量 |
skew()* | 值的偏差 |
std() | 值的标准偏差 |
sum() | 值的总和 |
var() | 值的方差 |
# 集中趋势
# 按学生分组计算平均分数
df.groupby(['student'])['score'].mean()
# 按学生分组计算中位数分数
df.groupby(['student'])['score'].median()
# 按学生分组计算指定分位数
df.groupby(['student'])['score'].quantile()
student
James 65.0
Jennifer 62.5
John 55.0
Name: score, dtype: float64
# 离散趋势
# 按学生分组计算最大分数
df.groupby(['student'])['score'].max()
# 按学生分组计算最小分数
df.groupby(['student'])['score'].min()
# 按学生分组计算方差
df.groupby(['student'])['score'].var()
# 按学生分组计算协方差
# df.groupby(['student'])['score'].cov()
# 按学生分组计算偏差
df.groupby(['student'])['score'].skew()
# 按学生分组计算标准差
df.groupby(['student'])['score'].std()
student
James 49.497475
Jennifer 45.961941
John 49.497475
Name: score, dtype: float64
# 计数
# 按学生分组计算非空数量
df.groupby(['student'])['score'].count()
# 按学生分组计算所有值数量
df.groupby(['student'])['score'].size()
# 按学生分组计算唯一值数量
df.groupby(['student'])['score'].nunique()
# 按学生分组计算求和
df.groupby(['student'])['score'].sum()
# 按学生分组计算求乘积
df.groupby(['student'])['score'].prod()
student
James 3000
Jennifer 2850
John 1800
Name: score, dtype: int64
# 排序
# 按学生分组返回第一个score
df.groupby(['student'])['score'].first()
# 按学生分组返回最后一个score
df.groupby(['student'])['score'].last()
# 按学生分组返回第n个score
df.groupby(['student'])['score'].nth(1)
student subject
James math 100
Jennifer math 95
John math 90
Name: score, dtype: int64
分箱时将连续数值变量变为离散分类变量的一种方法,常用于特征工程,这对于组的划分有更高的要求,而不是简单的将相同值划分为同一组。
分箱的分为很多种,简单的有等频和等距,如果涉及到算法的,常见的有决策树、卡方、最优分箱等,这里只给出最简单的能由pandas
实现的等频和等距两种方法。
df = pd.read_csv('data/titanic.csv')
df.sample(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
414 | 415 | 1 | 3 | Sundman, Mr. Johan Julian | male | 44.0 | 0 | 0 | STON/O 2. 3101269 | 7.925 | NaN | S |
728 | 729 | 0 | 2 | Bryhl, Mr. Kurt Arnold Gottfrid | male | 25.0 | 1 | 0 | 236853 | 26.000 | NaN | S |
186 | 187 | 1 | 3 | O'Brien, Mrs. Thomas (Johanna "Hannah" Godfrey) | female | NaN | 1 | 0 | 370365 | 15.500 | NaN | Q |
449 | 450 | 1 | 1 | Peuchen, Major. Arthur Godfrey | male | 52.0 | 0 | 0 | 113786 | 30.500 | C104 | S |
382 | 383 | 0 | 3 | Tikkanen, Mr. Juho | male | 32.0 | 0 | 0 | STON/O 2. 3101293 | 7.925 | NaN | S |
即按相同距离将连续的数值划分成不同的箱体,每个箱体的间距相等,但箱体内的数据并不一定相等,用pd.cut()
实现。
pd.cut(df['Age'], bins=6)
0 (13.683, 26.947]
1 (26.947, 40.21]
2 (13.683, 26.947]
3 (26.947, 40.21]
4 (26.947, 40.21]
...
886 (26.947, 40.21]
887 (13.683, 26.947]
888 NaN
889 (13.683, 26.947]
890 (26.947, 40.21]
Name: Age, Length: 891, dtype: category
Categories (6, interval[float64, right]): [(0.34, 13.683] < (13.683, 26.947] < (26.947, 40.21] < (40.21, 53.473] < (53.473, 66.737] < (66.737, 80.0]]
pd.cut(df['Age'], bins=6).value_counts().sort_index()
Age
(0.34, 13.683] 71
(13.683, 26.947] 248
(26.947, 40.21] 245
(40.21, 53.473] 100
(53.473, 66.737] 43
(66.737, 80.0] 7
Name: count, dtype: int64
当bins
为标量序列时,cut
会按照我们自定义的边界点进行分箱,不再是等距分箱。
pd.cut(df['Age'], bins=[0, 10, 30, 50, 100])
0 (10.0, 30.0]
1 (30.0, 50.0]
2 (10.0, 30.0]
3 (30.0, 50.0]
4 (30.0, 50.0]
...
886 (10.0, 30.0]
887 (10.0, 30.0]
888 NaN
889 (10.0, 30.0]
890 (30.0, 50.0]
Name: Age, Length: 891, dtype: category
Categories (4, interval[int64, right]): [(0, 10] < (10, 30] < (30, 50] < (50, 100]]
pd.cut(df['Age'], bins=[0, 10, 30, 50, 100]).value_counts().sort_index()
Age
(0, 10] 64
(10, 30] 345
(30, 50] 241
(50, 100] 64
Name: count, dtype: int64
等频要求分组内的数量相等,在pandas
中用qcut
实现。
pd.qcut(df['Age'], q=10).value_counts().sort_index()
Age
(0.419, 14.0] 77
(14.0, 19.0] 87
(19.0, 22.0] 67
(22.0, 25.0] 70
(25.0, 28.0] 61
(28.0, 31.8] 66
(31.8, 36.0] 91
(36.0, 41.0] 53
(41.0, 50.0] 78
(50.0, 80.0] 64
Name: count, dtype: int64
当分类型变量有很多值但分布比较分散的时候,我们有时需要做合并的处理。
df['Embarked'].value_counts(dropna=False)
Embarked
S 644
C 168
Q 77
NaN 2
Name: count, dtype: int64
where
可接受的筛选条件必须是布尔型的,基本判断逻辑是:如果对象不满足匹配条件,就被赋值为默认空值或other
参数指定的值,满足条件的保持原值不变。
cond = df['Embarked'].isin(['S','C'])
df['Embarked'].where(cond, other='other', inplace=True)
df['Embarked'].value_counts(dropna=False)
Embarked
S 644
C 168
other 79
Name: count, dtype: int64
df.loc[~df['Embarked'].isin(['S','C']), 'Embarked'] = 'other'
df['Embarked'].value_counts(dropna=False)
Embarked
S 644
C 168
other 79
Name: count, dtype: int64
有时候我们会对分箱结果不满意,例如其中连着的两个分箱内数量太少不满足统计最小要求,因此选择合并。
# 先分箱
a, b = pd.cut(df['Age'], bins=8, retbins=True)
print(b)
print('-'*8)
a.value_counts(dropna=False).sort_index()
[ 0.34042 10.3675 20.315 30.2625 40.21 50.1575 60.105 70.0525
80. ]
--------
Age
(0.34, 10.368] 64
(10.368, 20.315] 115
(20.315, 30.263] 230
(30.263, 40.21] 155
(40.21, 50.158] 86
(50.158, 60.105] 42
(60.105, 70.052] 17
(70.052, 80.0] 5
NaN 177
Name: count, dtype: int64
# (60.105, 70.052]和(60.105, 70.052]箱体数量太少,选择合并
bins = [0.34042,10.3675,20.315,30.2625,40.21,50.1575,60.105,80]
a, b = pd.cut(df['Age'], bins=bins, retbins=True)
print(b)
print('-'*8)
a.value_counts(dropna=False).sort_index()
[ 0.34042 10.3675 20.315 30.2625 40.21 50.1575 60.105 80. ]
--------
Age
(0.34, 10.368] 64
(10.368, 20.315] 115
(20.315, 30.262] 230
(30.262, 40.21] 155
(40.21, 50.158] 86
(50.158, 60.105] 42
(60.105, 80.0] 22
NaN 177
Name: count, dtype: int64