python大数据分析
数据分析:Python的Pandas库数据处理 #生活知识# #编程教程#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
!pip install xgboost
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import warnings
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Suppress library warnings so the output stays uncluttered.
warnings.filterwarnings(action='ignore')

# Use the Microsoft YaHei font for sans-serif text; the plot titles below are
# written in Chinese (presumably this font is chosen so they render).
plt.rcParams.update({'font.sans-serif': ['Microsoft YaHei']})

# Load the dataset; the file is decoded as GBK.
df = pd.read_csv(
    '/home/mw/input/data1581/Sleep_health_and_lifestyle_dataset.csv',
    encoding='gbk',
)

# First look at the data: leading rows, then column dtypes / non-null counts.
df.head()
df.info()
# ---------------------------------------------------------------------------
# Data quality check and feature encoding
# ---------------------------------------------------------------------------
# NOTE: bare expressions such as the one below are notebook cell outputs. They
# display a value in Jupyter and are no-ops when run as a plain script.
df.duplicated().sum()

df_new = df.copy()

# Collect every text column except blood pressure ('血压'); that column holds
# "high/low" strings and is split into two integer columns further down.
cat_cols = []
for each in df_new.columns.tolist():
    if df_new[each].dtype == 'object' and each != '血压':
        cat_cols.append(each)
        print(df_new[each].value_counts().to_frame())

# Replace each categorical column by integer codes. The loop header was
# missing in the original, which left `col` undefined.
for col in cat_cols:
    le = LabelEncoder()
    le.fit(df_new[col])
    df_new[col] = le.transform(df_new[col])
df_new.head()

# Split "high/low" blood pressure strings into two integer columns.
xueya = df_new['血压'].str.split('/', expand=True)
xueya.columns = ['高压', '低压']
xueya = xueya.astype(int)
df_new = pd.concat([df_new, xueya], axis=1)
df_new.info()


# ---------------------------------------------------------------------------
# Plot helpers
# ---------------------------------------------------------------------------
def _plot_count_by_gender(hue, title):
    """Show a count plot of the raw data per '性别' value, split by `hue`."""
    plt.figure(figsize=(12, 8))
    sns.countplot(x='性别', hue=hue, data=df, palette='Set3')
    plt.title(title, fontsize=20)
    plt.show()


def _plot_distribution(values, title, bins=15):
    """Show a density-normalised histogram of `values` with its KDE curve."""
    plt.figure(figsize=(12, 8))
    plt.hist(values, density=True, bins=bins,
             color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
    values.plot(kind='kde')
    plt.title(title, fontsize=20)
    plt.show()


def _plot_box_by_gender(column, title):
    """Show a box plot of the encoded data's `column` per '性别' value."""
    plt.figure(figsize=(4, 8))
    sns.boxplot(x='性别', y=column, palette='Set3', data=df_new)
    plt.title(title, fontsize=15)
    plt.show()


# ---------------------------------------------------------------------------
# Categorical overviews (raw, un-encoded data)
# ---------------------------------------------------------------------------
_plot_count_by_gender('职业', '男女及从事职业情况')
_plot_count_by_gender('睡眠障碍', '男女睡眠障碍情况')
_plot_count_by_gender('BMI', '男女BMI情况')

# ---------------------------------------------------------------------------
# Distributions of the numeric columns
# ---------------------------------------------------------------------------
_plot_distribution(df['年龄'], '年龄分布')
df['年龄'].min(), df['年龄'].max()

_plot_distribution(df['睡眠时长'], '睡眠时间分布')
_plot_distribution(df['睡眠质量'], '睡眠质量分布', bins=6)
_plot_distribution(df['心率'], '心率分布')

# Both blood pressure components share one figure, so this plot is written
# out instead of going through the single-series helper.
plt.figure(figsize=(12, 8))
plt.hist(df_new['高压'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7), label='高压')
df_new['高压'].plot(kind='kde', label='高压')
plt.hist(df_new['低压'], density=True, bins=15,
         color=plt.cm.RdBu(0.3), edgecolor=plt.cm.RdBu(0.2), label='低压')
df_new['低压'].plot(kind='kde', label='低压')
plt.title('血压分布', fontsize=20)
plt.legend()
plt.show()
df_new['高压'].max(), df_new['高压'].min(), df_new['低压'].min(), df_new['低压'].max()

_plot_distribution(df['身体活动水平'], '身体活动水平分布')
df['身体活动水平'].min(), df['身体活动水平'].max()

_plot_distribution(df['压力水平'], '压力水平分布')
_plot_distribution(df['每日步数'], '每日步数分布')

# ---------------------------------------------------------------------------
# Pairwise relationships and correlation heat map
# ---------------------------------------------------------------------------
# The first column is skipped (presumably the 'ID' column that is dropped
# below - confirm against the dataset).
sns.pairplot(df_new[df_new.columns.tolist()[1:]])

# '血压' is still a string column at this point. Restrict to numeric columns
# explicitly: pandas >= 2.0 raises on non-numeric data in corr(), older
# versions dropped such columns silently. Compute the matrix once and reuse it.
corr = df_new.iloc[:, 1:].select_dtypes(include='number').corr()
corr_labels = corr.columns.tolist()
plt.figure(figsize=(12, 12))
plt.imshow(corr, cmap='Blues')
plt.xticks(range(len(corr_labels)), corr_labels, rotation=45)
plt.yticks(range(len(corr_labels)), corr_labels, rotation=45)
plt.colorbar()
plt.show()

# ---------------------------------------------------------------------------
# Random-forest feature importances for the three target columns
# ---------------------------------------------------------------------------
target = ['睡眠时长', '睡眠质量', '睡眠障碍']
df_new.drop(columns=['ID'], inplace=True)
df_new.drop(columns=['血压'], inplace=True)

# The first two targets are fitted with a regressor.
for i, target_name in enumerate(target[:2]):
    y = df_new[target_name]
    X = df_new.iloc[:, ~df_new.columns.isin(target)]
    model = RandomForestRegressor()
    model.fit(X, y)
    print('在' + target_name + '作为因变量时,各因素重要性为:')
    plt.figure(figsize=(8, 8))
    plt.subplot(2, 1, i + 1)
    # One column of importances, one row per feature.
    plt.imshow(model.feature_importances_.reshape(-1, 1))
    plt.yticks(range(len(X.columns.tolist())), X.columns.tolist())
    plt.xticks(range(1))
    plt.xlabel(target_name)
    plt.colorbar()
    plt.show()

# The third target is fitted with a classifier.
y = df_new[target[2]]
X = df_new.iloc[:, ~df_new.columns.isin(target)]
model1 = RandomForestClassifier()
model1.fit(X, y)
plt.imshow(model1.feature_importances_.reshape(-1, 1))
plt.yticks(range(len(X.columns.tolist())), X.columns.tolist())
plt.xticks(range(1))
plt.xlabel(target[2])
plt.colorbar()
plt.show()

# ---------------------------------------------------------------------------
# Box plots by gender (encoded data)
# ---------------------------------------------------------------------------
_plot_box_by_gender('身体活动水平', '不同性别身体活动水平的箱型图分析')
_plot_box_by_gender('压力水平', '不同性别压力水平的箱型图分析')
# The original called `lt.figure` here (typo for `plt.figure`).
_plot_box_by_gender('心率', '不同性别心率的箱型图分析')

# Both blood pressure components are drawn onto the same axes.
plt.figure(figsize=(12, 8))
sns.boxplot(x='性别', y='高压', palette='Set3', data=df_new)
sns.boxplot(x='性别', y='低压', palette='Set3', data=df_new)
plt.title('不同性别血压的箱型图分析', fontsize=15)
plt.show()

# ---------------------------------------------------------------------------
# XGBoost classifier for '睡眠障碍'
# ---------------------------------------------------------------------------
X = df_new.drop(columns=['睡眠障碍'])
y = df_new[['睡眠障碍']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=42)

model2 = xgboost.XGBClassifier()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
df_new['睡眠障碍'].unique()

cm = confusion_matrix(y_test, y_pred)

# Text for each encoded class. NOTE(review): this assumes LabelEncoder
# assigned 0/1/2 in this order - verify with `le.classes_` for the column.
label_mapping = {0: '失眠', 1: '无', 2: '睡眠呼吸暂停'}

# Print the confusion matrix row by row. Rows are tagged with the label name;
# the original iterated the dict keys and therefore printed the integer code
# for the row while printing names for the columns.
for i, true_label in enumerate(label_mapping.values()):
    row = ''.join(
        f'{cm[i, j]} ({pred_label})\t'
        for j, pred_label in enumerate(label_mapping.values())
    )
    print(f'{row} | {true_label}')

print(classification_report(y_test, y_pred, target_names=['失眠', '无', '睡眠呼吸暂停']))
# Draw the confusion matrix `cm` as an annotated heat map.

# Tick labels in class-code order. The original referenced `label_names`
# without ever defining it; derive it from the `label_mapping` dict built
# when the confusion matrix was computed.
label_names = list(label_mapping.values())

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)
ax.set(
    xticks=np.arange(cm.shape[1]),
    yticks=np.arange(cm.shape[0]),
    xticklabels=label_names,
    yticklabels=label_names,
    title='Confusion matrix',
    ylabel='True label',
    xlabel='Predicted label',
)

# Write each count into its cell; switch to white text on the darker cells
# (counts above half of the maximum) so the number stays readable.
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], 'd'),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")

fig.tight_layout()
plt.show()
网址:python大数据分析 https://www.yuejiaxmz.com/news/view/202831
相关内容
Python数据分析实战Python数据分析:对饮食与健康数据的分析与可视化
python excel数据分析师职业技能
python数据分析
Python中的生活数据分析与个人健康监测.pptx
生活中的什么数据可以做数据分析
Python数据分析:统计函数绘制简单图形
django+hadoop基于Python的王者荣耀战队的数据分析系统(源码+文档+调试+可视化大屏)
利用Python进行数据分析——Pandas(2)
【Python】Python连接Hadoop数据中遇到的各种坑(汇总)