# 导入要用到的包和进行一些基础设置,这段代码拷贝即可
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Use seaborn's default plotting theme. This runs before the rcParams overrides
# below because set_theme() itself rewrites matplotlib's rcParams.
sns.set_theme()
# Font used for text in figures. Without a CJK-capable font, Chinese characters
# are rendered as garbage. "Microsoft YaHei" is used here; if it does not
# display correctly, "SimHei" or "SimSun" can be used instead.
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
# Make the minus sign "-" render correctly in figures instead of as a garbled glyph.
plt.rcParams["axes.unicode_minus"] = False
# 绘图使用'svg'后端:svg是矢量格式,可以任意缩放均保持清晰,各种屏幕的显示良好。
%config InlineBackend.figure_formats = ['svg']
# Generate data and plot ==========================
# 50 evenly spaced x values on [1, 7].
x = np.linspace(1, 7, 50)
# Linear relation y = 3 + 2x with Gaussian noise (standard deviation 1.5).
y = 3 + 2 * x + 1.5 * np.random.randn(len(x))
df = pd.DataFrame({"xData": x, "yData": y})
# Scatter plot with a fitted regression line; the frame is passed by keyword
# so the call does not depend on regplot's positional parameter order.
sns.regplot(data=df, x="xData", y="yData")
plt.show()
第2部分 统计学
- 基本统计检验
- 参数检验
- 方差分析
- 非参数检验
- 相关性和回归
- 聚类分析
- 因子分析
- 判别分析(分类)
需要用到的工具
seaborn绘图包
统计学包
pingouin:一个常用的统计学包。首次运行时,可以在一个单元格中执行以下命令来安装这个包。注意安装之后就不需要再执行了。
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pingouin
import pingouin as pg
from scipy import stats
# Generate synthetic data.
# Fix the NumPy random seed so the noise below is reproducible.
np.random.seed(123)
x = np.arange(100)
# Linear relation y = 1.5x + 50 with Gaussian noise (standard deviation 10).
y = 1.5 * x + 50 + 10 * np.random.randn(len(x))
# Fit a simple linear regression of y on x with pingouin and round the
# resulting table to 2 decimals.
lm = pg.linear_regression(x, y).round(2)
# A bare expression as the last line of a notebook cell displays the table.
lm
| | names | coef | se | T | pval | r2 | adj_r2 | CI[2.5%] | CI[97.5%] |
---|---|---|---|---|---|---|---|---|---|
0 | Intercept | 50.13 | 2.26 | 22.16 | 0.0 | 0.94 | 0.94 | 45.64 | 54.62 |
1 | x1 | 1.50 | 0.04 | 38.06 | 0.0 | 0.94 | 0.94 | 1.42 | 1.58 |
statsmodels,另一个常用统计学工具包
import statsmodels.formula.api as sm
# Generate a noisy line, and save the data in a pandas DataFrame.
x = np.arange(100)
# y = 0.5x - 20 plus standard-normal noise.
y = 0.5 * x - 20 + np.random.randn(len(x))
df = pd.DataFrame({"x": x, "y": y})
# Fit a linear model, using the "formula" language
# added by the package "patsy"
# Ordinary least squares fit of the formula "y~x" on the columns of df,
# followed by the full regression summary.
model = sm.ols("y~x", data=df).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.995
Model: OLS Adj. R-squared: 0.995
Method: Least Squares F-statistic: 2.164e+04
Date: Thu, 11 Apr 2024 Prob (F-statistic): 8.94e-117
Time: 10:48:09 Log-Likelihood: -138.22
No. Observations: 100 AIC: 280.4
Df Residuals: 98 BIC: 285.6
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -19.8325 0.193 -102.607 0.000 -20.216 -19.449
x 0.4962 0.003 147.110 0.000 0.490 0.503
==============================================================================
Omnibus: 5.025 Durbin-Watson: 1.894
Prob(Omnibus): 0.081 Jarque-Bera (JB): 4.660
Skew: -0.369 Prob(JB): 0.0973
Kurtosis: 3.758 Cond. No. 114.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.