Python大数据分析

发布时间：2024-11-23 03:47

数据分析：Python的Pandas库数据处理 #生活知识# #编程教程#

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

!pip install xgboost

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

import warnings

import xgboost

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# --- Environment setup and data loading ---------------------------------
# Suppress all warnings so library notices do not clutter the output.
warnings.filterwarnings('ignore')

# Font used for the Chinese titles and labels in the charts below.
plt.rcParams.update({'font.sans-serif': ['Microsoft YaHei']})

# Load the dataset; the file is decoded as GBK.
csv_path = '/home/mw/input/data1581/Sleep_health_and_lifestyle_dataset.csv'
df = pd.read_csv(csv_path, encoding='gbk')

# Notebook-style inspection: first rows, then dtypes / non-null counts.
df.head()
df.info()

# --- Data cleaning and feature encoding ---------------------------------
# Number of fully duplicated rows (notebook-style inspection; not stored).
df.duplicated().sum()

# Encode on a copy so `df` keeps its original text labels for the charts.
df_new = df.copy()

# Collect the text (object-dtype) columns and print their value counts.
# '血压' (blood pressure) is excluded: it is parsed into two integer
# columns further down instead of being label-encoded.
cat_cols = []
for each in df_new.columns.tolist():
    if df_new[each].dtype == 'object' and each != '血压':
        cat_cols.append(each)
        print(df_new[each].value_counts().to_frame())

# Replace each categorical column with integer codes. The source used `col`
# without any enclosing loop; iterating over `cat_cols` restores it.
for col in cat_cols:
    le = LabelEncoder()
    le.fit(df_new[col])
    df_new[col] = le.transform(df_new[col])
df_new.head()

# Split the "a/b" blood-pressure strings on '/' into two integer columns:
# '高压' (first value) and '低压' (second value).
xueya = df_new['血压'].str.split('/', expand=True)
xueya.columns = ['高压', '低压']
xueya = xueya.astype(int)
df_new = pd.concat([df_new, xueya], axis=1)
df_new.info()

# Canvas for the chart drawn by the statements that follow.
plt.figure(figsize=(12, 8))

# --- Count plots by gender (drawn from the un-encoded frame `df`) -------
# Each statement now sits on its own line; the source had `plt.show()` fused
# with the following `plt.figure(...)`, which is a syntax error.

# Occupation ('职业') counts per gender ('性别'), on the current figure.
sns.countplot(x='性别', hue='职业', data=df, palette='Set3')
plt.title('男女及从事职业情况', fontsize=20)
plt.show()

# Sleep-disorder ('睡眠障碍') counts per gender.
plt.figure(figsize=(12, 8))
sns.countplot(x='性别', hue='睡眠障碍', data=df, palette='Set3')
plt.title('男女睡眠障碍情况', fontsize=20)
plt.show()

# BMI category counts per gender.
plt.figure(figsize=(12, 8))
sns.countplot(x='性别', hue='BMI', data=df, palette='Set3')
plt.title('男女BMI情况', fontsize=20)
plt.show()

# Canvas for the chart drawn by the statements that follow.
plt.figure(figsize=(12, 8))

# --- Distributions: age, sleep duration, sleep quality, heart rate ------
# Every chart overlays a density-normalised histogram with a KDE curve of
# the same column. Statements that the source fused onto one line are
# separated here so the script parses.

# Age ('年龄'), drawn on the current figure.
plt.hist(df['年龄'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
df['年龄'].plot(kind='kde')
plt.title('年龄分布', fontsize=20)
plt.show()

# Age range (notebook-style inspection; the tuple is not stored).
df['年龄'].min(), df['年龄'].max()

# Sleep duration ('睡眠时长').
plt.figure(figsize=(12, 8))
plt.hist(df['睡眠时长'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
df['睡眠时长'].plot(kind='kde')
plt.title('睡眠时间分布', fontsize=20)
plt.show()

# Sleep quality ('睡眠质量'); this one uses 6 bins instead of 15.
plt.figure(figsize=(12, 8))
plt.hist(df['睡眠质量'], density=True, bins=6,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
df['睡眠质量'].plot(kind='kde')
plt.title('睡眠质量分布', fontsize=20)
plt.show()

# Heart rate ('心率').
plt.figure(figsize=(12, 8))
plt.hist(df['心率'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
df['心率'].plot(kind='kde')
plt.title('心率分布', fontsize=20)
plt.show()

# Canvas for the chart drawn by the statements that follow.
plt.figure(figsize=(12, 8))

# --- Distributions: blood pressure, activity, stress, daily steps -------
# Statements that the source fused onto one line are separated here so the
# script parses.

# Both blood-pressure columns ('高压' and '低压', from `df_new`) share the
# current figure, each as a density histogram plus KDE curve.
plt.hist(df_new['高压'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7), label='高压')
df_new['高压'].plot(kind='kde', label='高压')
plt.hist(df_new['低压'], density=True, bins=15,
         color=plt.cm.RdBu(0.3), edgecolor=plt.cm.RdBu(0.2), label='低压')
df_new['低压'].plot(kind='kde', label='低压')
plt.title('血压分布', fontsize=20)
plt.legend()
plt.show()

# Blood-pressure extremes (notebook-style inspection; not stored).
df_new['高压'].max(), df_new['高压'].min(), df_new['低压'].min(), df_new['低压'].max()

# Physical activity level ('身体活动水平').
plt.figure(figsize=(12, 8))
plt.hist(df['身体活动水平'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
df['身体活动水平'].plot(kind='kde')
plt.title('身体活动水平分布', fontsize=20)
plt.show()

# Activity-level range (notebook-style inspection; not stored).
df['身体活动水平'].min(), df['身体活动水平'].max()

# Stress level ('压力水平').
plt.figure(figsize=(12, 8))
plt.hist(df['压力水平'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
df['压力水平'].plot(kind='kde')
plt.title('压力水平分布', fontsize=20)
plt.show()

# Daily steps ('每日步数').
plt.figure(figsize=(12, 8))
plt.hist(df['每日步数'], density=True, bins=15,
         color=plt.cm.RdBu(0.6), edgecolor=plt.cm.RdBu(0.7))
df['每日步数'].plot(kind='kde')
plt.title('每日步数分布', fontsize=20)
plt.show()

# Pairwise relationships over every column of `df_new` except the first.
sns.pairplot(df_new[df_new.columns.tolist()[1:]])

# Canvas for the chart drawn by the statements that follow.
plt.figure(figsize=(12, 12))

# --- Correlation heat map ------------------------------------------------
# Compute the correlation matrix once; the source recomputed it five times.
# The text column '血压' is still part of `df_new` at this point, so the
# frame is limited to numeric columns first. Without this, `.corr()` raises
# on pandas >= 2.0; older versions dropped such columns silently.
# The first column is skipped, as in the source.
corr = df_new.iloc[:, 1:].select_dtypes(include='number').corr()
corr_labels = corr.columns.tolist()

plt.imshow(corr, cmap='Blues')
plt.xticks(range(len(corr_labels)), corr_labels, rotation=45)
plt.yticks(range(len(corr_labels)), corr_labels, rotation=45)
plt.colorbar()
plt.show()

# Columns treated as dependent variables in the modelling steps:
# sleep duration, sleep quality, sleep disorder.
target = ['睡眠时长', '睡眠质量', '睡眠障碍']

# --- Feature importance for the two numeric targets ----------------------
# Remove 'ID' and the raw '血压' text column (already split into
# '高压'/'低压') so only model-ready columns remain.
df_new.drop(columns=['ID', '血压'], inplace=True)

# Fit one random-forest regressor per numeric target (the first two entries
# of `target`) and show its feature importances as a one-column heat map.
for i, target_name in enumerate(target[:2]):
    y = df_new[target_name]
    # Predictors are every column that is not one of the target columns.
    X = df_new.iloc[:, ~df_new.columns.isin(target)]
    model = RandomForestRegressor()
    model.fit(X, y)
    print('在' + target_name + '作为因变量时，各因素重要性为：')

    plt.figure(figsize=(8, 8))
    plt.subplot(2, 1, i + 1)
    plt.imshow(model.feature_importances_.reshape(-1, 1))
    plt.yticks(range(len(X.columns.tolist())), X.columns.tolist())
    plt.xticks(range(1))
    plt.xlabel(target_name)
    plt.colorbar()
    plt.show()

# Dependent variable for the classification step: '睡眠障碍'.
y = df_new[target[2]]

# --- Feature importance for the sleep-disorder target --------------------
# Predictors are every column that is not one of the target columns.
X = df_new.iloc[:, ~df_new.columns.isin(target)]
model1 = RandomForestClassifier()
model1.fit(X, y)

# One-column heat map of the importances, one row per predictor.
plt.imshow(model1.feature_importances_.reshape(-1, 1))
plt.yticks(range(len(X.columns.tolist())), X.columns.tolist())
plt.xticks(range(1))
plt.xlabel(target[2])
plt.colorbar()
plt.show()

# Canvas for the chart drawn by the statements that follow
# (fused with `plt.show()` on one line in the source).
plt.figure(figsize=(4, 8))

# --- Box plots by gender (on the encoded frame `df_new`) -----------------
# Physical activity level per gender, on the current figure.
sns.boxplot(x='性别', y='身体活动水平', palette='Set3', data=df_new)
plt.title('不同性别身体活动水平的箱型图分析', fontsize=15)
plt.show()

# Stress level per gender.
plt.figure(figsize=(4, 8))
sns.boxplot(x='性别', y='压力水平', palette='Set3', data=df_new)
plt.title('不同性别压力水平的箱型图分析', fontsize=15)
plt.show()

# Heart rate per gender. The source wrote `lt.figure(...)`, an undefined
# name; it is corrected to `plt.figure(...)`.
plt.figure(figsize=(4, 8))
sns.boxplot(x='性别', y='心率', palette='Set3', data=df_new)
plt.title('不同性别心率的箱型图分析', fontsize=15)
plt.show()

# Both blood-pressure columns per gender, drawn on the same axes.
plt.figure(figsize=(12, 8))
sns.boxplot(x='性别', y='高压', palette='Set3', data=df_new)
sns.boxplot(x='性别', y='低压', palette='Set3', data=df_new)
plt.title('不同性别血压的箱型图分析', fontsize=15)
plt.show()

# Feature matrix for the classifier: every column except '睡眠障碍'.
X = df_new.drop(columns=['睡眠障碍'])

# --- Train and evaluate the XGBoost classifier ---------------------------
# Each statement is on its own line; the source fused several onto L307 and
# L309, which is a syntax error.

# Label column, kept as a single-column DataFrame.
y = df_new[['睡眠障碍']]

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=42)

model2 = xgboost.XGBClassifier()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

# Encoded class values present in the data (notebook-style inspection).
df_new['睡眠障碍'].unique()

cm = confusion_matrix(y_test, y_pred)

# Encoded class -> display name.
# NOTE(review): assumes LabelEncoder assigned 0/1/2 in this order - confirm
# against the encoder's classes_.
label_mapping = {0: '失眠', 1: '无', 2: '睡眠呼吸暂停'}

# --- Text report of the confusion matrix ---------------------------------
# One printed line per true class: each cell count is tagged with the
# predicted class name, and the true class name closes the line.
# The source iterated the dict itself, so the trailing label was the integer
# key (0/1/2); iterating the values prints the class name, like the cells.
for i, true_label in enumerate(label_mapping.values()):
    row = ''.join(
        f'{cm[i, j]} ({pred_label})\t'
        for j, pred_label in enumerate(label_mapping.values())
    )
    print(f'{row} | {true_label}')

# Per-class precision / recall / F1, using the same display names.
print(classification_report(y_test, y_pred,
                            target_names=list(label_mapping.values())))

# --- Confusion-matrix heat map -------------------------------------------
# `label_names` was used but never defined in the source; it is derived from
# the class mapping so tick labels follow the encoded class order.
label_names = list(label_mapping.values())

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)
ax.set(xticks=np.arange(cm.shape[1]),
       yticks=np.arange(cm.shape[0]),
       xticklabels=label_names, yticklabels=label_names,
       title='Confusion matrix',
       ylabel='True label',
       xlabel='Predicted label')

# Write each count into its cell; cells above half of the largest count get
# white text, the rest black.
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], 'd'),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")

fig.tight_layout()
plt.show()

网址：Python大数据分析 https://www.yuejiaxmz.com/news/view/202831

⬅️上一篇：建设15分钟社区生活圈对于城镇不

➡️下一篇：方芳的快乐生活

Python大数据分析

相关内容

随便看看

最新动态分享

热点动态分享

专题

推荐动态分享