关注公众号:『AI学习星球
』
回复:母婴产品电商
即可获取数据下载。
算法学习
、4对1辅导
、论文辅导
或核心期刊
可以通过公众号
或?v:codebiubiubiu
滴滴我
近年来母婴消费逐渐增加,这是一份来自淘宝天猫的母婴产品销售数据。分析该数据集有利于了解目前市场的销售情况,便于做出运营决策,提高销售额。
此数据集中能得到的指标主要有:营运指标—>成交指标—>成交数量和成交用户数,会员指标—>会员复购率和平均购买次数。
此分析中根据场景对成交数量表述为销量或购买的数量等,对成交用户数表述成购买用户数或用户数等。此分析主要结合销量和购买用户数一起分析购买情况。
本次分析主要想探索下列问题:
数据来自淘宝和天猫上购买婴儿用品信息,本数据集包括两个文件:
真实数据集的数据量非常大,仅婴儿信息就有900多万,天池数据集只给出了一些样本数据。
表:包含29972行,7列。
tianchi_mum_baby_trade_history.csv
,一共29972行,7列
字段名字 | 字段含义 |
---|---|
user_id | 用户id |
auction_id | 购买行为编号 |
cat1 | 商品所属的大类 |
cat_id | 商品种类id,cat_id是cat1的子类,是更细分的类别 |
property | 商品属性 |
buy_mount | 购买数量 |
day | 购买时间 |
tianchi_mum_baby.csv
,一共954行,3列。
字段名字 | 字段含义 |
---|---|
user_id | 用户id |
birthday | 出生日期 |
gender | 性别(0 女孩,1 男孩,2性别不明) |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'
%matplotlib inline
# Load the trade history; the `day` column is parsed into datetimes on read.
df = pd.read_csv(
    'tianchi_mum_baby_trade_history.csv',
    engine='python',
    parse_dates=['day'],
)
# (rows, columns) of the loaded table.
df.shape
(29971, 7)
# Print each column's dtype and non-null count to spot missing values.
df.info()
这里只有property这列存在缺失值(仅有29827条非空记录),其他列都没有缺失。
property这列的数据较分散,难以分析,因此不做处理。
# Summary statistics for a first look at the value ranges.
df.describe()
# Shape of the rows whose user_id already appeared on an earlier row.
df[df.duplicated('user_id')].shape
(27, 7)
这里有27条记录的user_id与之前的记录重复,进一步查看这些数据:
# user_id of every row whose user_id already appeared on an earlier row.
df[df.duplicated('user_id')].user_id
进一步查看user_id重复的原始数据:
# Show every row (first occurrence included) of the users that appear more
# than once, grouped together by sorting on user_id.
repeated_ids = df.loc[df.duplicated('user_id'), 'user_id']
df[df.user_id.isin(repeated_ids)].sort_values(by='user_id', ascending=True)
# Drop, in place, the rows whose purchase quantity exceeds 101.
oversized_rows = df.index[df.buy_mount > 101]
df.drop(index=oversized_rows, inplace=True)
关注公众号:『AI学习星球
』
回复:母婴产品电商
即可获取数据下载。
算法学习
、4对1辅导
、论文辅导
或核心期刊
可以通过公众号
或?v:codebiubiubiu
滴滴我
import seaborn as sns

# Top-10 sub-categories (cat_id) by units sold.
# buy_mount is selected *before* aggregating: summing the whole frame would
# also try to sum the datetime `day` column and the text `property` column.
sale_num_top = (
    df.groupby('cat_id')['buy_mount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
# Turn cat_id into a "_"-prefixed string so the x axis is categorical and
# keeps this row order; with numeric ids the bars would be ordered by id and
# the value labels drawn below would end up on the wrong bars.
sale_num_top['cat_id'] = '_' + sale_num_top['cat_id'].astype(str)

# Bar chart with the value written at the top of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='cat_id', y='buy_mount', data=sale_num_top)
plt.xticks(rotation=70)
for bar_pos, units in enumerate(sale_num_top.buy_mount):
    plt.text(bar_pos, units, units, va='center', ha='center', rotation=45)
# Top-10 sub-categories (cat_id) by number of trade rows.
order_num_top = (
    df.groupby('cat_id')['user_id']
    .count()
    .rename('order_num')
    .reset_index()
    .sort_values(by='order_num', ascending=False)
    .head(10)
)
# "_"-prefixed string ids keep the x axis categorical, in the sorted row order.
order_num_top['cat_id'] = '_' + order_num_top['cat_id'].astype(str)

# Bar chart with the count written at the top of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='cat_id', y='order_num', data=order_num_top)
plt.xticks(rotation=70)
for bar_pos, orders in enumerate(order_num_top.order_num):
    plt.text(bar_pos, orders, orders, va='center', ha='center', rotation=45)
# Units sold per root category (cat1).
# Only buy_mount is aggregated; summing the whole frame would also try to sum
# the datetime `day` column and the text `property` column.
cat1_sale_num = df.groupby('cat1')['buy_mount'].sum().reset_index()
# "_"-prefixed string ids make seaborn treat cat1 as a category, not a number.
cat1_sale_num['cat1'] = '_' + cat1_sale_num['cat1'].astype(str)

# Horizontal bar chart with the value written at the end of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='buy_mount', y='cat1', data=cat1_sale_num)
plt.xticks(rotation=70)
for bar_pos, units in enumerate(cat1_sale_num.buy_mount):
    plt.text(units, bar_pos, units, va='center', ha='center')
# Number of trade rows per root category (cat1), largest first.
cat1_order_num = (
    df.groupby('cat1')['user_id']
    .count()
    .rename('order_num')
    .reset_index()
    .sort_values(by='order_num', ascending=False)
)
# "_"-prefixed string ids make seaborn treat cat1 as a category, not a number.
cat1_order_num['cat1'] = '_' + cat1_order_num['cat1'].astype(str)

# Horizontal bar chart with the count written at the end of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='order_num', y='cat1', data=cat1_order_num)
plt.xticks(rotation=70)
for bar_pos, orders in enumerate(cat1_order_num.order_num):
    plt.text(orders, bar_pos, orders, va='center', ha='center')
# Add a helper column holding the calendar year of each purchase.
df['year'] = df['day'].dt.year
# Sample 10 random rows to check that the helper column was added correctly.
df.sample(10)
成功增加辅助列 - 年
# Units sold per year.
# Only buy_mount is aggregated; summing the whole frame would also try to sum
# the datetime `day` column and the text `property` column.
year_sale_num = df.groupby('year')['buy_mount'].sum().reset_index()

# Bar chart with the value written at the top of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='year', y='buy_mount', data=year_sale_num)
plt.xticks(rotation=70)
for bar_pos, units in enumerate(year_sale_num.buy_mount):
    plt.text(bar_pos, units, units, va='center', ha='center', rotation=45)
关注公众号:『AI学习星球
』
回复:母婴产品电商
即可获取数据下载。
算法学习
、4对1辅导
、论文辅导
或核心期刊
可以通过公众号
或?v:codebiubiubiu
滴滴我
# Number of trade rows per year.
# The count is stored as `order_num` (as in the other order-count tables) so
# the y axis is not labelled as if it were the quantity sold.
year_order_num = (
    df.groupby('year')['buy_mount']
    .count()
    .rename('order_num')
    .reset_index()
)

# Bar chart with the count written at the top of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='year', y='order_num', data=year_order_num)
plt.xticks(rotation=70)
for bar_pos, orders in enumerate(year_order_num.order_num):
    plt.text(bar_pos, orders, orders, va='center', ha='center', rotation=45)
增加辅助列 - 月
# Add a helper column holding the calendar month (1-12) of each purchase.
df['month'] = df['day'].dt.month
# Sample 10 random rows to eyeball the new column.
df.sample(10)
成功添加辅助列 - 月
# Units sold per (year, month).
# Only buy_mount is aggregated; summing the whole frame would also try to sum
# the datetime `day` column and the text `property` column.
month_sale_num = df.groupby(['year', 'month'])['buy_mount'].sum().reset_index()

# One line per year; the palette gets exactly one colour per year present in
# the data instead of assuming there are always four.
plt.figure(figsize=(6, 4), dpi=128)
palette = sns.color_palette("mako_r", month_sale_num['year'].nunique())
sns.lineplot(x='month', y='buy_mount', hue='year', data=month_sale_num, palette=palette)
# Number of trade rows per (year, month).
month_order_num = (
    df.groupby(['year', 'month'])['user_id']
    .count()
    .rename('order_num')
    .reset_index()
)

# One line per year; the palette gets exactly one colour per year present in
# the data instead of assuming there are always four.
plt.figure(figsize=(6, 4), dpi=128)
palette = sns.color_palette("mako_r", month_order_num['year'].nunique())
sns.lineplot(x='month', y='order_num', hue='year', data=month_order_num, palette=palette)
import pyecharts.options as opts
from pyecharts.charts import Bar

# Units sold per (year, root category). Only buy_mount is aggregated; summing
# the whole frame would also try to sum the datetime and text columns.
year_cat1_sale_num = df.groupby(['year', 'cat1'])['buy_mount'].sum().reset_index()
# Window-style total: the yearly sum repeated on every row of that year.
year_cat1_sale_num['sum'] = year_cat1_sale_num.groupby('year')['buy_mount'].transform('sum')
# Each category's share of its year, as a percentage with two decimals.
year_cat1_sale_num['rate'] = round(
    (year_cat1_sale_num['buy_mount'] / year_cat1_sale_num['sum']) * 100, 2)

# One row per year, one column per root category. A category with no sales in
# some year becomes 0 here; filtering the long table per category would yield
# a shorter list whose values are then drawn against the wrong years.
rate_by_year = year_cat1_sale_num.pivot(
    index='year', columns='cat1', values='rate').fillna(0)

# Stacked bar chart: one bar per year, one stacked segment per root category.
years = [str(year) for year in rate_by_year.index]
bar = Bar().add_xaxis(years)
for cat1_id in rate_by_year.columns:
    bar.add_yaxis(str(cat1_id), rate_by_year[cat1_id].tolist(), stack='stack1')
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True,
                                              position='center',
                                              color='black',
                                              font_size=18,
                                              formatter="{c} %"))
bar.set_global_opts(title_opts=opts.TitleOpts(title='每年各跟类别的销售数量占比'),
                    xaxis_opts=opts.AxisOpts(name='年份'),
                    yaxis_opts=opts.AxisOpts(name='各跟类别的销售数量占比'))
# To export a standalone page instead: bar.render('柱状堆叠图4.html')
bar.render_notebook()
# Number of trade rows per (year, root category).
year_cat1_order_num = (
    df.groupby(['year', 'cat1'])['user_id']
    .count()
    .rename('order_num')
    .reset_index()
)
# Window-style total: the yearly order count repeated on every row of that year.
year_cat1_order_num['sum'] = year_cat1_order_num.groupby(
    'year')['order_num'].transform('sum')
# Each category's share of its year, as a percentage with two decimals.
year_cat1_order_num['rate'] = round(
    (year_cat1_order_num['order_num'] / year_cat1_order_num['sum']) * 100, 2)

# One row per year, one column per root category. A category with no orders
# in some year becomes 0 here; filtering the long table per category would
# yield a shorter list whose values are then drawn against the wrong years.
rate_by_year = year_cat1_order_num.pivot(
    index='year', columns='cat1', values='rate').fillna(0)

# Stacked bar chart: one bar per year, one stacked segment per root category.
years = [str(year) for year in rate_by_year.index]
bar = Bar().add_xaxis(years)
for cat1_id in rate_by_year.columns:
    bar.add_yaxis(str(cat1_id), rate_by_year[cat1_id].tolist(), stack='stack1')
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True,
                                              position='center',
                                              color='black',
                                              font_size=18,
                                              formatter="{c} %"))
bar.set_global_opts(title_opts=opts.TitleOpts(title='每年各跟类别的订单量占比'),
                    xaxis_opts=opts.AxisOpts(name='年份'),
                    yaxis_opts=opts.AxisOpts(name='各跟类别的订单量占比'))
# To export a standalone page instead: bar.render('订单量1.html')
bar.render_notebook()
# Load the baby information table; `birthday` is parsed into datetimes on read.
baby_df = pd.read_csv(
    'tianchi_mum_baby.csv',
    engine='python',
    parse_dates=['birthday'],
)
# (rows, columns) of the loaded table.
baby_df.shape
(953, 3)
# Print each column's dtype and non-null count to spot missing values.
baby_df.info()
没有缺失值
# Number of distinct user_id values; compare with the row count to detect repeats.
baby_df.user_id.unique().size
953
性别变量的三个值:0女性,1男性,2未知
# Shape of the rows whose gender code is 2.
baby_df[baby_df.gender==2].shape
(26, 3)
性别未知的有26人,占了总数的2.7%,直接删除处理
# Remove, in place, the babies whose gender code is 2.
unknown_gender_rows = baby_df.index[baby_df.gender == 2]
baby_df.drop(index=unknown_gender_rows, inplace=True)
# Keep only the trades whose user_id also has a baby record (inner join).
sample_df = df.merge(baby_df, on='user_id', how='inner')
sample_df.shape
(929, 11)
# Rows whose user_id already appeared on an earlier row of the merged table.
sample_df[sample_df.duplicated('user_id')]
进一步查看所有重复数据
# Show every row (first occurrence included) of the users that appear more
# than once. The ids are taken from the data itself: hand-typed string ids
# never match a user_id column that was read from the CSV as integers.
repeat_buyer_ids = sample_df.loc[sample_df.duplicated('user_id'), 'user_id']
sample_df[sample_df.user_id.isin(repeat_buyer_ids)]
可以看出这些是重复购买的数据,没有异常。
def parse_gender(x):
    """Translate a numeric gender code into its display label.

    Args:
        x: Gender code; 0 means girl, 1 means boy.

    Returns:
        "女" for 0, "男" for 1, and "未知" for any other value, so an
        unknown or missing code is never reported as a boy.
    """
    if x == 0:
        return "女"
    if x == 1:
        return "男"
    return "未知"
# Replace the numeric gender codes with the labels returned by parse_gender.
sample_df['gender'] = sample_df['gender'].apply(parse_gender)
# Display the table to check the converted column.
sample_df
用户性别分布情况
# Pie chart of the gender split.
# NOTE: this counts trade rows, so a user with several purchases is counted
# once per purchase.
plt.figure(figsize=(6, 4), dpi=128)
rows_per_gender = sample_df.groupby('gender')['user_id'].count()
rows_per_gender.plot(kind='pie', autopct='%.2f%%')
关注公众号:『AI学习星球
』
回复:母婴产品电商
即可获取数据下载。
算法学习
、4对1辅导
、论文辅导
或核心期刊
可以通过公众号
或?v:codebiubiubiu
滴滴我
# Units sold per baby gender.
# Only buy_mount is aggregated; summing the whole merged frame would also try
# to sum its datetime columns (`day`, `birthday`) and the text `property`.
gender_sale_num = sample_df.groupby('gender')['buy_mount'].sum().reset_index()

# Bar chart with the value written at the top of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='gender', y='buy_mount', data=gender_sale_num)
plt.xticks(rotation=70)
for bar_pos, units in enumerate(gender_sale_num.buy_mount):
    plt.text(bar_pos, units, units, va='center', ha='center', rotation=45)
# Number of trade rows per baby gender.
gender_order_num = (
    sample_df.groupby('gender')['user_id']
    .count()
    .rename('order_num')
    .reset_index()
)

# Bar chart with the count written at the top of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='gender', y='order_num', data=gender_order_num)
plt.xticks(rotation=70)
for bar_pos, orders in enumerate(gender_order_num.order_num):
    plt.text(bar_pos, orders, orders, va='center', ha='center', rotation=45)
# Units sold per (gender, root category). Only buy_mount is aggregated;
# summing the whole merged frame would also try to sum its datetime and text
# columns.
gender_cat1_sale_num = sample_df.groupby(
    ['gender', 'cat1'])['buy_mount'].sum().reset_index()
# Per-gender total repeated on every row of that gender.
gender_cat1_sale_num['sum'] = gender_cat1_sale_num.groupby(
    'gender')['buy_mount'].transform('sum')
# Each category's share within its gender, as a percentage with two decimals.
gender_cat1_sale_num['rate'] = round(
    (gender_cat1_sale_num['buy_mount'] / gender_cat1_sale_num['sum']) * 100, 2)

# One row per gender, one column per root category. A category that one
# gender never bought becomes 0 here; filtering the long table per category
# would yield a one-element list that is then drawn against the wrong gender.
rate_by_gender = gender_cat1_sale_num.pivot(
    index='gender', columns='cat1', values='rate').fillna(0)

# Stacked bar chart: one bar per gender, one stacked segment per root category.
genders = [str(gender) for gender in rate_by_gender.index]
bar = Bar().add_xaxis(genders)
for cat1_id in rate_by_gender.columns:
    bar.add_yaxis(str(cat1_id), rate_by_gender[cat1_id].tolist(), stack='stack1')
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True,
                                              position='center',
                                              color='black',
                                              font_size=18,
                                              formatter="{c} %"))
# The x axis shows genders, so it is named accordingly (it used to say "year").
bar.set_global_opts(title_opts=opts.TitleOpts(title='各性别销售数量中的跟类别占比'),
                    xaxis_opts=opts.AxisOpts(name='性别'),
                    yaxis_opts=opts.AxisOpts(name='各跟类别的销售数量占比'))
# To export a standalone page instead: bar.render('各性别销售数量中的跟类别占比1.html')
bar.render_notebook()
# Number of trade rows per (gender, root category).
gender_cat1_order_num = (
    sample_df.groupby(['gender', 'cat1'])['user_id']
    .count()
    .rename('order_num')
    .reset_index()
)
# Per-gender total repeated on every row of that gender.
gender_cat1_order_num['sum'] = gender_cat1_order_num.groupby(
    'gender')['order_num'].transform('sum')
# Each category's share within its gender, as a percentage with two decimals.
gender_cat1_order_num['rate'] = round(
    (gender_cat1_order_num['order_num'] / gender_cat1_order_num['sum']) * 100, 2)

# One row per gender, one column per root category. A category that one
# gender never ordered becomes 0 here; filtering the long table per category
# would yield a one-element list that is then drawn against the wrong gender.
rate_by_gender = gender_cat1_order_num.pivot(
    index='gender', columns='cat1', values='rate').fillna(0)

# Stacked bar chart: one bar per gender, one stacked segment per root category.
genders = [str(gender) for gender in rate_by_gender.index]
bar = Bar().add_xaxis(genders)
for cat1_id in rate_by_gender.columns:
    bar.add_yaxis(str(cat1_id), rate_by_gender[cat1_id].tolist(), stack='stack1')
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True,
                                              position='center',
                                              color='black',
                                              font_size=18,
                                              formatter="{c} %"))
# The x axis shows genders, so it is named accordingly (it used to say "year").
bar.set_global_opts(title_opts=opts.TitleOpts(title='各性别订单量中的跟类别占比'),
                    xaxis_opts=opts.AxisOpts(name='性别'),
                    yaxis_opts=opts.AxisOpts(name='各跟类别的订单量占比'))
# To export a standalone page instead: bar.render('各性别订单量中的跟类别占比1.html')
bar.render_notebook()
# Age in whole calendar years, measured against a fixed reference year.
# NOTE(review): the age is relative to 2020, not to the purchase date in
# `day` - confirm this is the intended definition.
REFERENCE_YEAR = 2020
# Vectorised year extraction instead of a per-row Python lambda.
sample_df['age'] = REFERENCE_YEAR - sample_df['birthday'].dt.year
# Sample 5 random rows to eyeball the new column.
sample_df.sample(5)
def parse_age_to(age, bounds=(3, 6, 9, 12, 15),
                 labels=("3岁以下", "3-5岁", "6-8岁", "9-11岁", "12-14岁", "15岁以上")):
    """Map an age in years to the label of the bracket it falls into.

    Args:
        age: Age in years.
        bounds: Ascending exclusive upper bounds of the brackets; the
            defaults give <3, 3-5, 6-8, 9-11, 12-14 and 15+.
        labels: Bracket labels, one per bound plus a final label for ages at
            or above the last bound.

    Returns:
        The label of the first bracket whose upper bound is greater than
        ``age``, or the last label when no bound is greater (which is also
        what a NaN age yields, since every comparison with NaN is false).

    Raises:
        ValueError: If ``labels`` does not have exactly one more entry than
            ``bounds``.
    """
    if len(labels) != len(bounds) + 1:
        raise ValueError("labels must contain exactly one more entry than bounds")
    for upper_bound, label in zip(bounds, labels):
        if age < upper_bound:
            return label
    return labels[-1]
# Label every row with the age bracket returned by parse_age_to.
sample_df['age_type'] = sample_df['age'].apply(parse_age_to)
# Sample 5 random rows to eyeball the new column.
sample_df.sample(5)
# Number of trade rows per age bracket, plotted youngest to oldest.
# groupby sorts the bracket labels as plain strings (e.g. "12-14岁" before
# "3-5岁"), so the rows are re-ordered by bracket before plotting; labels not
# listed here are kept and placed last.
age_order = ['3岁以下', '3-5岁', '6-8岁', '9-11岁', '12-14岁', '15岁以上']
age_rank = {label: rank for rank, label in enumerate(age_order)}
rows_by_age = sample_df.groupby('age_type')['user_id'].count()
ordered_labels = sorted(
    rows_by_age.index, key=lambda label: age_rank.get(label, len(age_rank)))
age_type_user_num = pd.DataFrame({
    'age_type': ordered_labels,
    'order_num': rows_by_age.loc[ordered_labels].values,
})
# Bubble plot: the marker size encodes the same count as the y position.
sns.scatterplot(x='age_type', y='order_num', size='order_num', data=age_type_user_num)
# Units sold per age bracket, plotted youngest to oldest.
# Only buy_mount is aggregated; summing the whole merged frame would also try
# to sum its datetime and text columns.
units_by_age = sample_df.groupby('age_type')['buy_mount'].sum()
# groupby sorts the bracket labels as plain strings (e.g. "12-14岁" before
# "3-5岁"), so the rows are re-ordered by bracket before plotting; labels not
# listed here are kept and placed last.
age_order = ['3岁以下', '3-5岁', '6-8岁', '9-11岁', '12-14岁', '15岁以上']
age_rank = {label: rank for rank, label in enumerate(age_order)}
ordered_labels = sorted(
    units_by_age.index, key=lambda label: age_rank.get(label, len(age_rank)))
age_type_sale_num = pd.DataFrame({
    'age_type': ordered_labels,
    'buy_mount': units_by_age.loc[ordered_labels].values,
})

# Bar chart with the value written at the top of each bar.
plt.figure(figsize=(6, 4), dpi=128)
sns.barplot(x='age_type', y='buy_mount', data=age_type_sale_num)
plt.xticks(rotation=70)
for bar_pos, units in enumerate(age_type_sale_num.buy_mount):
    plt.text(bar_pos, units, units, va='center', ha='center', rotation=45)
关注公众号:『AI学习星球
』
回复:母婴产品电商
即可获取数据下载。
算法学习
、4对1辅导
、论文辅导
或核心期刊
可以通过公众号
或?v:codebiubiubiu
滴滴我