Compute Tiger - HW5

HW5

bigdata
Author

tiger

Published
<<<<<<< HEAD

July 30, 2024

=======

July 29, 2024

>>>>>>> 6cbbb7eade37dee30b9807f3594b0c1fd3e6305a
<<<<<<< HEAD
#파일 불러오기, 컬럼명 변경
import pandas as pd
import seaborn as sns    
import numpy as np    
import matplotlib.pyplot as plt    
!pip install pyreadstat
# Read the SPSS data file; work on a copy so the raw frame stays untouched.
raw_welfare = pd.read_spss('data/Koweps_hpwc14_2019_beta2.sav')
welfare = raw_welfare.copy()

# Coded column names mapped to the readable names used by the later cells.
renamed_columns = {
    "h14_g3": "sex",
    "h14_g4": "birth",
    "h14_g10": "marriage_type",
    "h14_g11": "religion",
    "p1402_8aq1": "income",
    "h14_eco9": "code_job",
    "h14_reg7": "code_region",
}

# Rename the coded columns, then keep only the renamed ones (in mapping order).
welfare = welfare.rename(columns=renamed_columns)
welfare = welfare[list(renamed_columns.values())]

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: pyreadstat in c:\ds\python\python312\lib\site-packages (1.2.7)
Requirement already satisfied: pandas>=1.2.0 in c:\ds\python\python312\lib\site-packages (from pyreadstat) (2.2.2)
Requirement already satisfied: numpy>=1.26.0 in c:\ds\python\python312\lib\site-packages (from pandas>=1.2.0->pyreadstat) (2.0.0)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\ds\python\python312\lib\site-packages (from pandas>=1.2.0->pyreadstat) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\ds\python\python312\lib\site-packages (from pandas>=1.2.0->pyreadstat) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\ds\python\python312\lib\site-packages (from pandas>=1.2.0->pyreadstat) (2024.1)
Requirement already satisfied: six>=1.5 in c:\ds\python\python312\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.2.0->pyreadstat) (1.16.0)
# Sex preprocessing: inspect category counts and missing values, then label
# the value 1.0 as "male" and every other value as "female".
welfare["sex"].value_counts()
welfare["sex"].isna().sum()
is_male = welfare["sex"] == 1.0
welfare["sex"] = np.where(is_male, "male", "female")

# Income preprocessing: drop rows with missing income, average income per sex,
# and show the averages as a bar chart.
income_known = welfare.dropna(subset="income")
sex_income = income_known.groupby("sex", as_index=False)[["income"]].mean()
sns.barplot(data=sex_income, x="sex", y="income")
plt.show()

=======

표본 분산 계산 시 왜 n-1로 나누는지 알아보도록 하겠습니다. 균일분포 (3, 7)에서 20개의 표본을 뽑아서 분산을 2가지 방법으로 추정해보세요.


1. n-1로 나눈 것을 s_2, n으로 나눈 것을 k_2로 정의하고, s_2의 분포와 k_2의 분포를 그려주세요! (10000개 사용)

import numpy as np    
import pandas as pd
import matplotlib.pyplot as plt    
import seaborn as sns    

# Draw all 10000 samples of size 20 from Uniform(3, 7) in one call;
# each row of `samples` is one sample.
samples = np.random.uniform(3, 7, size=(10000, 20))

# s_2: sum of squared deviations from the sample mean divided by n-1 (ddof=1).
# k_2: the same sum divided by n (ddof=0).
# Both are kept as lists so later cells that use s_2 / k_2 work unchanged.
s_2 = list(samples.var(axis=1, ddof=1))
k_2 = list(samples.var(axis=1, ddof=0))

# Overlay the two estimator distributions on the same axes.
sns.histplot(data=s_2, color="red")
sns.histplot(data=k_2, color="y")
plt.show()

>>>>>>> 6cbbb7eade37dee30b9807f3594b0c1fd3e6305a
<<<<<<< HEAD

각 성별 95% 신뢰구간을 계산한 후 그리기. norm.ppf()를 사용해서 그릴 것. 모분산은 표본 분산을 사용해서 추정할 것.

from scipy.stats import *
# 95% confidence interval for the mean income of each sex, drawn as a red
# vertical segment on top of the mean-income bar chart.
all_income = welfare.dropna(subset="income")
f_income = all_income.loc[all_income["sex"] == "female", "income"]
m_income = all_income.loc[all_income["sex"] == "male", "income"]

f_mean = f_income.mean()
m_mean = m_income.mean()

# The sample standard deviation stands in for the unknown population value.
f_std = f_income.std()
m_std = m_income.std()

f_n = len(f_income)
m_n = len(m_income)

# A two-sided 95% interval leaves 2.5% in each tail, so the critical value is
# the 97.5th percentile of the standard normal (ppf(0.95) would give only a
# 90% interval).
z = norm.ppf(0.975, loc=0, scale=1)

# Margin of error: critical value times the standard error of the mean.
f_margin = z * f_std / np.sqrt(f_n)
m_margin = z * m_std / np.sqrt(m_n)

f_max = f_mean + f_margin
f_min = f_mean - f_margin

m_max = m_mean + m_margin
m_min = m_mean - m_margin

plt.clf()
sns.barplot(data=sex_income, x="sex", y="income")
# The male interval is drawn at x=1 and the female interval at x=0.
plt.plot([1, 1], [m_min, m_max], 'r-')
plt.plot([0, 0], [f_min, f_max], 'r-')
plt.show()

=======

2. 각 분포 그래프에 모분산의 위치에 녹색 막대를 그려주세요.

# Population variance of the uniform distribution on (3, 7): (b - a)^2 / 12.
var = (7 - 3) ** 2 / 12

# Mark the population variance with a green vertical line, then draw the
# histogram of the estimates that divide by n-1.
plt.axvline(x=var, color="green", ls="-", lw=3)
sns.histplot(data=s_2, color="red")
plt.show()
print("n-1로 나눈 것")

n-1로 나눈 것
# Green vertical line at the population variance, followed by the histogram
# of the estimates that divide by n.
variance_marker = {"color": "green", "linestyle": "-", "linewidth": 3}
plt.axvline(var, **variance_marker)
sns.histplot(data=k_2, color="y")
plt.show()
print("n으로 나눈 것")

n으로 나눈 것

3. 결과를 살펴보고, 왜 n-1로 나눈 것을 분산을 추정하는 지표로 사용하는 것이 타당한지 써주세요!

# Draw both estimator histograms on the same axes, with the population
# variance marked by a green vertical line, so they can be compared directly.
plt.axvline(x=var, color="green", linestyle="-", linewidth=3)
for estimates, shade in ((s_2, "red"), (k_2, "y")):
    sns.histplot(data=estimates, color=shade)
plt.show()

>>>>>>> 6cbbb7eade37dee30b9807f3594b0c1fd3e6305a