Pandas

Published: January 3, 2024

Basic introduction to pandas

import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np.nan,44,1])  # if NumPy feels like a list, pandas feels more like a dictionary
s  # this can be thought of as a one-dimensional pandas structure
0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64
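Because a Series behaves like a labelled dictionary, it can also be built directly from a dict, with the keys becoming the index. A minimal sketch (the city names and numbers below are made up for illustration):

# hypothetical data: dict keys become the index labels
population = pd.Series({'Beijing': 2154, 'Shanghai': 2424, 'Shenzhen': 1303})
population['Shanghai']  # label-based lookup, just like a dict: 2424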
dates = pd.date_range('20160101',periods=6)  # define the row index
dates
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4),index=dates)
df
                   0         1         2         3
2016-01-01  0.310279 -0.113450  1.453515  0.893409
2016-01-02  0.511068 -0.088535 -1.751460  0.390180
2016-01-03  0.415210  0.352752  0.431860  0.225930
2016-01-04  0.649793  0.743668  1.250057  1.396353
2016-01-05  1.145737  0.338144  1.077738  0.856458
2016-01-06  0.037643  1.375382  1.560754 -0.435449
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])  # the standard way to create a DataFrame
df
                   a         b         c         d
2016-01-01  1.117627 -0.796587  0.041202 -0.772693
2016-01-02 -0.987977 -1.525442 -0.684378  0.007355
2016-01-03 -0.255173 -1.444724  0.599456  1.050332
2016-01-04 -0.020769 -0.354652 -1.111232  1.217364
2016-01-05 -1.114441 -0.069303  0.473385  0.425665
2016-01-06  1.157257 -0.081045  0.973594  1.198853
df1 = pd.DataFrame(np.arange(12).reshape(3,4))
df1
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
df2 = pd.DataFrame({'A':1.,
                    'B':pd.Timestamp('20230102'),
                    'C':pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D':np.array([3]*4,dtype='int32'),
                    'E':pd.Categorical(['test','train','test','train']),
                    'F':'foo'})  # create a DataFrame from a dict
df2
     A          B    C  D      E    F
0  1.0 2023-01-02  1.0  3   test  foo
1  1.0 2023-01-02  1.0  3  train  foo
2  1.0 2023-01-02  1.0  3   test  foo
3  1.0 2023-01-02  1.0  3  train  foo
df2.dtypes  # the data type of each column
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
df2.index  # the row index
Int64Index([0, 1, 2, 3], dtype='int64')
df2.columns  # the column index
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
df2.values  # the underlying values
array([[1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)
df2.describe()  # summary statistics of the numeric columns
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
df2.T  # transpose
                     0                    1                    2                    3
A                  1.0                  1.0                  1.0                  1.0
B  2023-01-02 00:00:00  2023-01-02 00:00:00  2023-01-02 00:00:00  2023-01-02 00:00:00
C                  1.0                  1.0                  1.0                  1.0
D                    3                    3                    3                    3
E                 test                train                 test                train
F                  foo                  foo                  foo                  foo
df2.sort_index(axis=1,ascending=False)  # sort the columns in descending order
     F      E  D    C          B    A
0  foo   test  3  1.0 2023-01-02  1.0
1  foo  train  3  1.0 2023-01-02  1.0
2  foo   test  3  1.0 2023-01-02  1.0
3  foo  train  3  1.0 2023-01-02  1.0
df2.sort_index(axis=0,ascending=False)  # sort the rows in descending order
     A          B    C  D      E    F
3  1.0 2023-01-02  1.0  3  train  foo
2  1.0 2023-01-02  1.0  3   test  foo
1  1.0 2023-01-02  1.0  3  train  foo
0  1.0 2023-01-02  1.0  3   test  foo
df2.sort_values(by='E')  # sort by the values in a column
     A          B    C  D      E    F
0  1.0 2023-01-02  1.0  3   test  foo
2  1.0 2023-01-02  1.0  3   test  foo
1  1.0 2023-01-02  1.0  3  train  foo
3  1.0 2023-01-02  1.0  3  train  foo
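sort_values can also sort in descending order or by several columns at once; a quick sketch on the same df2 (both keyword arguments are part of the standard sort_values signature):

df2.sort_values(by='E', ascending=False)   # descending order
df2.sort_values(by=['E', 'D'])             # sort by E first, then by D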

Selecting data in pandas

import pandas as pd
import numpy as np
dates = pd.date_range('20240101',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df['A']  # select a single column
2024-01-01     0
2024-01-02     4
2024-01-03     8
2024-01-04    12
2024-01-05    16
2024-01-06    20
Freq: D, Name: A, dtype: int32
df.A  # the same column, attribute style
2024-01-01     0
2024-01-02     4
2024-01-03     8
2024-01-04    12
2024-01-05    16
2024-01-06    20
Freq: D, Name: A, dtype: int32
df[0:3]  # select the first three rows
            A  B   C   D
2024-01-01  0  1   2   3
2024-01-02  4  5   6   7
2024-01-03  8  9  10  11
df['20240102':'20240105']
             A   B   C   D
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19

Select by label: loc (pure label-based selection)

df.loc['20240103']  # select by label; more explicit
A     8
B     9
C    10
D    11
Name: 2024-01-03 00:00:00, dtype: int32
df.loc[:,['A','B']]  # keep all rows, select columns A and B
             A   B
2024-01-01   0   1
2024-01-02   4   5
2024-01-03   8   9
2024-01-04  12  13
2024-01-05  16  17
2024-01-06  20  21
df.loc['20240102',['A','B']]
A    4
B    5
Name: 2024-01-02 00:00:00, dtype: int32

Select by position: iloc (pure position-based selection)

df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df.iloc[3]  # select the row at position 3 (the fourth row)
A    12
B    13
C    14
D    15
Name: 2024-01-04 00:00:00, dtype: int32
df.iloc[3,1]  # the value at row position 3, column position 1
13
df.iloc[3:5,1:3]  
             B   C
2024-01-04  13  14
2024-01-05  17  18
df.iloc[[1,3,5],1:3] 
             B   C
2024-01-02   5   6
2024-01-04  13  14
2024-01-06  21  22

Mixed selection: ix (labels and positions together)

df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df.ix[:3,['A','C']]  # .ix has been removed in recent pandas versions
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

Cell In[64], line 1
----> 1 df.ix[:3,['A','C']]


File D:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:5902, in NDFrame.__getattr__(self, name)
   5895 if (
   5896     name not in self._internal_names_set
   5897     and name not in self._metadata
   5898     and name not in self._accessors
   5899     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5900 ):
   5901     return self[name]
-> 5902 return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'ix'
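With .ix gone, a mixed label/position selection has to be spelled out through loc or iloc. Two equivalent sketches for the call above, assuming the same df as in this section:

# turn the row positions into labels and use loc
df.loc[df.index[:3], ['A', 'C']]

# or turn the column labels into positions and use iloc
df.iloc[:3, df.columns.get_indexer(['A', 'C'])]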

Boolean indexing

df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df[df.A>8]
             A   B   C   D
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
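Boolean conditions can be combined with & (and), | (or) and ~ (not); each condition needs its own parentheses. A short sketch on the same df:

df[(df.A > 8) & (df.D < 20)]   # rows where A > 8 and D < 20
df[df.A.isin([0, 20])]         # rows where A is one of the listed values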

Setting values in pandas

df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df.iloc[2,2]=1111
df
             A   B     C   D
2024-01-01   0   1     2   3
2024-01-02   4   5     6   7
2024-01-03   8   9  1111  11
2024-01-04  12  13    14  15
2024-01-05  16  17    18  19
2024-01-06  20  21    22  23
df.loc['20240102','C']=2222
df
             A   B     C   D
2024-01-01   0   1     2   3
2024-01-02   4   5  2222   7
2024-01-03   8   9  1111  11
2024-01-04  12  13    14  15
2024-01-05  16  17    18  19
2024-01-06  20  21    22  23
df[df.A>4]=0
df
            A  B     C  D
2024-01-01  0  1     2  3
2024-01-02  4  5  2222  7
2024-01-03  0  0     0  0
2024-01-04  0  0     0  0
2024-01-05  0  0     0  0
2024-01-06  0  0     0  0
dates = pd.date_range('20240101',periods=6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df.A[df.A>4]=0
df
            A   B   C   D
2024-01-01  0   1   2   3
2024-01-02  4   5   6   7
2024-01-03  0   9  10  11
2024-01-04  0  13  14  15
2024-01-05  0  17  18  19
2024-01-06  0  21  22  23
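df.A[df.A>4]=0 happens to work here, but it is chained indexing and pandas will usually raise a SettingWithCopyWarning for it. The recommended form is a single .loc call that selects the rows and the column together; a sketch:

df.loc[df.A > 4, 'A'] = 0   # same effect as df.A[df.A>4]=0
df.loc[df.A > 2, 'B'] = 0   # same idea for the next example below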
dates = pd.date_range('20240101',periods=6)
dates
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df.B[df.A>2]=0
df
             A  B   C   D
2024-01-01   0  1   2   3
2024-01-02   4  0   6   7
2024-01-03   8  0  10  11
2024-01-04  12  0  14  15
2024-01-05  16  0  18  19
2024-01-06  20  0  22  23
df['F']=np.nan
df
             A   B   C   D  E   F
2024-01-01   0   1   2   3  1 NaN
2024-01-02   4   5   6   7  2 NaN
2024-01-03   8   9  10  11  3 NaN
2024-01-04  12  13  14  15  4 NaN
2024-01-05  16  17  18  19  5 NaN
2024-01-06  20  21  22  23  6 NaN
df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20240101',periods=6))
df
             A   B   C   D  E   F
2024-01-01   0   1   2   3  1 NaN
2024-01-02   4   5   6   7  2 NaN
2024-01-03   8   9  10  11  3 NaN
2024-01-04  12  13  14  15  4 NaN
2024-01-05  16  17  18  19  5 NaN
2024-01-06  20  21  22  23  6 NaN
df['E']=pd.Series([1,2,3,4,5,6],index=df.index)
df
             A   B   C   D  E   F
2024-01-01   0   1   2   3  1 NaN
2024-01-02   4   5   6   7  2 NaN
2024-01-03   8   9  10  11  3 NaN
2024-01-04  12  13  14  15  4 NaN
2024-01-05  16  17  18  19  5 NaN
2024-01-06  20  21  22  23  6 NaN

Handling missing data in pandas

import pandas as pd
import numpy as np
dates = pd.date_range('20240101',periods=6)
dates
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
df
             A   B   C   D
2024-01-01   0   1   2   3
2024-01-02   4   5   6   7
2024-01-03   8   9  10  11
2024-01-04  12  13  14  15
2024-01-05  16  17  18  19
2024-01-06  20  21  22  23
df.iloc[0,1]=np.nan
df.iloc[1,2]=np.nan
df
             A     B     C   D
2024-01-01   0   NaN   2.0   3
2024-01-02   4   5.0   NaN   7
2024-01-03   8   9.0  10.0  11
2024-01-04  12  13.0  14.0  15
2024-01-05  16  17.0  18.0  19
2024-01-06  20  21.0  22.0  23
df.dropna(axis=0,how='any')  # drop rows that contain NaN
# how={'any','all'}
# 'any': drop the row if it contains any NaN; 'all': drop it only if every value is NaN
             A     B     C   D
2024-01-03   8   9.0  10.0  11
2024-01-04  12  13.0  14.0  15
2024-01-05  16  17.0  18.0  19
2024-01-06  20  21.0  22.0  23
df.dropna(axis=1,how='any')  # drop columns that contain NaN
             A   D
2024-01-01   0   3
2024-01-02   4   7
2024-01-03   8  11
2024-01-04  12  15
2024-01-05  16  19
2024-01-06  20  23
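Besides how='any', dropna accepts how='all' (drop a row or column only when every value is NaN) and a thresh argument (keep a row only if it has at least that many non-NaN values); a short sketch on the same df:

df.dropna(axis=0, how='all')   # nothing dropped here: no row is entirely NaN
df.dropna(axis=0, thresh=4)    # keep rows with at least 4 non-NaN values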
df
             A     B     C   D
2024-01-01   0   NaN   2.0   3
2024-01-02   4   5.0   NaN   7
2024-01-03   8   9.0  10.0  11
2024-01-04  12  13.0  14.0  15
2024-01-05  16  17.0  18.0  19
2024-01-06  20  21.0  22.0  23
df.fillna(value=0)  # fill the NaN entries with a value
             A     B     C   D
2024-01-01   0   0.0   2.0   3
2024-01-02   4   5.0   0.0   7
2024-01-03   8   9.0  10.0  11
2024-01-04  12  13.0  14.0  15
2024-01-05  16  17.0  18.0  19
2024-01-06  20  21.0  22.0  23
df.isnull()  # element-wise check for missing data (NaN)
                A      B      C      D
2024-01-01  False   True  False  False
2024-01-02  False  False   True  False
2024-01-03  False  False  False  False
2024-01-04  False  False  False  False
2024-01-05  False  False  False  False
2024-01-06  False  False  False  False
np.any(df.isnull()) == True  # check whether the frame contains any NaN at all
True
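To see which columns contain missing values rather than a single overall flag, isnull() can be reduced per column; a small sketch:

df.isnull().any()          # True/False per column
df.isnull().sum()          # number of NaN values per column
df.isnull().values.any()   # one boolean for the whole DataFrame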

Importing and exporting data with pandas

import pandas as pd
data = pd.read_csv('C:/Users/43160/Desktop/肝代码/Python/数据分析/实验数据/Advertising.csv')
data
     Number     TV  radio  newspaper  sales
0         1  230.1   37.8       69.2   22.1
1         2   44.5   39.3       45.1   10.4
2         3   17.2   45.9       69.3    9.3
3         4  151.5   41.3       58.5   18.5
4         5  180.8   10.8       58.4   12.9
..      ...    ...    ...        ...    ...
195     196   38.2    3.7       13.8    7.6
196     197   94.2    4.9        8.1    9.7
197     198  177.0    9.3        6.4   12.8
198     199  283.6   42.0       66.2   25.5
199     200  232.1    8.6        8.7   13.4

200 rows × 5 columns

data.to_csv('advertising.csv')  # save the data
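Two options that come up constantly with CSV files: skip writing the row index, and use a column as the index when reading back. A sketch (the file names are just placeholders):

data.to_csv('advertising_no_index.csv', index=False)   # do not write the row index
data2 = pd.read_csv('advertising.csv', index_col=0)    # use the first column as the index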

Combining DataFrames in pandas

1. concatenating

import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
df1
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
df2
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
df3
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
# vertical concatenation: stack the rows
res = pd.concat([df1,df2,df3],axis=0)
res
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
res
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
# horizontal concatenation: place the columns side by side
res = pd.concat([df1,df2,df3],axis=1)
res
     a    b    c    d    a    b    c    d    a    b    c    d
0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0

join: ['inner', 'outer']

df1 = pd.DataFrame(np.ones((3,4))*0,index=[1,2,3],columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,index=[2,3,4],columns=['b','c','d','e'])
df1
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
df2
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
res = pd.concat([df1,df2])  # the default join is 'outer'
res
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
res = pd.concat([df1,df2],join='outer')  # roughly a union of the columns
res
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
res = pd.concat([df1,df2],join='inner')  # roughly an intersection of the columns
res
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
res = pd.concat([df1,df2],join='inner',ignore_index=True)  # re-number the index
res
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0

join_axes

df1
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
df2
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
res = pd.concat([df1,df2],axis=1)  
res
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
res = pd.concat([df1,df2],axis=1,join_axes=[df1.index])  # keep only df1's index; join_axes has been removed in recent pandas versions
res
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

Cell In[161], line 1
----> 1 res = pd.concat([df1,df2],axis=1,join_axes=[df1.index])  # keep only df1's index; join_axes has been removed in recent pandas versions
      2 res


File D:\ProgramData\anaconda3\Lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)


TypeError: concat() got an unexpected keyword argument 'join_axes'
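The documented replacement for join_axes is to concatenate first and then reindex on the index you want to keep; a sketch that reproduces the old behaviour for this example:

res = pd.concat([df1, df2], axis=1).reindex(df1.index)  # keep only the rows whose index is in df1
res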

2. append

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df1
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
df2
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
res = df1.append(df2,ignore_index=True)
res
C:\Users\43160\AppData\Local\Temp\ipykernel_15804\3917667868.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append(df2,ignore_index=True)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
df3 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
res = df1.append([df2,df3],ignore_index=True)
res
C:\Users\43160\AppData\Local\Temp\ipykernel_15804\3744420715.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append([df2,df3],ignore_index=True)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  1.0  1.0  1.0  1.0
7  1.0  1.0  1.0  1.0
8  1.0  1.0  1.0  1.0
res = df1.append([df2,df3])
res
C:\Users\43160\AppData\Local\Temp\ipykernel_15804\1214992729.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append([df2,df3])
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
# append a Series as a new row
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
s1 = pd.Series([1,2,3,4],index=['a','b','c','d']) 
s1
a    1
b    2
c    3
d    4
dtype: int64
res = df1.append(s1,ignore_index=True)
res
C:\Users\43160\AppData\Local\Temp\ipykernel_15804\2713288841.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  res = df1.append(s1,ignore_index=True)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0
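Because DataFrame.append is deprecated (and removed in pandas 2.0), the same results can be written with pd.concat; a sketch that mirrors the calls above, using the df1, df2, df3 and s1 defined in this section:

res = pd.concat([df1, df2], ignore_index=True)              # df1.append(df2, ignore_index=True)
res = pd.concat([df1, df2, df3], ignore_index=True)         # df1.append([df2, df3], ignore_index=True)
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)  # append a Series as one new row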

3. merge

import pandas as pd
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
right
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
left
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
res = pd.merge(left,right,on='key')   # merge on the 'key' column
res
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3

consider two keys

left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                             'key2': ['K0', 'K1', 'K0', 'K1'],
                             'A': ['A0', 'A1', 'A2', 'A3'],
                             'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                              'key2': ['K0', 'K0', 'K0', 'K0'],
                              'C': ['C0', 'C1', 'C2', 'C3'],
                              'D': ['D0', 'D1', 'D2', 'D3']})
left
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
right
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
res = pd.merge(left,right,on=['key1','key2'])
res
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
res = pd.merge(left,right,on=['key1','key2'],how='inner')  # 'inner' is the default
res
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
# how={'inner','outer','right','left'}
res = pd.merge(left,right,on=['key1','key2'],how='outer')  
res
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3
left
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
right
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
res = pd.merge(left,right,on=['key1','key2'],how='left')  
res
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
res = pd.merge(left,right,on=['key1','key2'],how='right')  
res
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3

indicator

# indicator
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
df1
   col1 col_left
0     0        a
1     1        b
df2
   col1  col_right
0     1          2
1     2          2
2     2          2
res = pd.merge(df1,df2,on='col1',how='outer',indicator=True)  # add a column showing how each row was merged
res
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only
res = pd.merge(df1,df2,on='col1',how='outer',indicator='indicator_columns')  # the indicator column can also be given a custom name
res
   col1 col_left  col_right indicator_columns
0     0        a        NaN         left_only
1     1        b        2.0              both
2     2      NaN        2.0        right_only
3     2      NaN        2.0        right_only

index

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                                  'B': ['B0', 'B1', 'B2']},
                                  index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                                     'D': ['D0', 'D2', 'D3']},
                                      index=['K0', 'K2', 'K3'])
left
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
right
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
# left_index and right_index
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
res
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
res
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
boys
    k  age
0  K0    1
1  K1    2
2  K2    3
girls
    k  age
0  K0    4
1  K0    5
2  K3    6
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')  # use suffixes when both frames have a column with the same name but different meanings
res
    k  age_boy  age_girl
0  K0        1         4
1  K0        1         5

pandas: data visualization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# plot data
# a Series is a one-dimensional array, i.e. linear data
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data
0      0.547677
1     -0.288794
2      0.556806
3      1.261752
4     -1.912560
         ...   
995    0.250478
996   -1.022430
997   -1.123374
998   -0.104338
999    1.049590
Length: 1000, dtype: float64
data = data.cumsum()  # cumulative sum
data.plot()    # plot the data
# plt.plot(x=, y=) would plot the data as well
plt.show()

[Figure: line plot of the cumulative Series]

# DataFrame
data = pd.DataFrame(np.random.randn(1000,4),
                   index=np.arange(1000),
                   columns=['A','B','C','D'])  # four data columns
data=data.cumsum()
data
             A          B          C         D
0    -1.854020  -1.031726   0.873153  1.601868
1    -2.494261  -1.244128   0.510932  2.150016
2    -2.516531  -2.961676  -0.284869  1.238185
3    -1.974520  -3.029144  -0.258707  1.761474
4    -2.170233  -2.911106   0.002738  1.778242
..         ...        ...        ...       ...
995 -15.542631  -8.357456  24.989268 -3.500648
996 -14.898920  -7.755639  24.748827 -3.434445
997 -15.438401 -10.115086  23.819015 -2.865272
998 -16.757351  -9.948964  24.401000 -1.790440
999 -18.415608 -10.377505  24.092952 -2.959285

1000 rows × 4 columns

data.head()
          A         B         C         D
0 -1.854020 -1.031726  0.873153  1.601868
1 -2.494261 -1.244128  0.510932  2.150016
2 -2.516531 -2.961676 -0.284869  1.238185
3 -1.974520 -3.029144 -0.258707  1.761474
4 -2.170233 -2.911106  0.002738  1.778242
data.plot()
plt.show()

[Figure: line plot of the four cumulative columns]

# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'

# a scatter plot takes exactly two columns: x and y
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
plt.show()

[Figure: scatter plot of A against B]

ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)  # ax=ax draws this scatter on the same axes as the previous plot
plt.show()

[Figure: the two scatter plots drawn on the same axes]
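The other plot methods listed above are used the same way; for instance a histogram of one column, as a quick sketch on the same data:

data['A'].plot.hist(bins=30)   # distribution of column A
plt.show()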

Source: https://blog.csdn.net/ccBcc_/article/details/135366905