#1. 데이터 불러오기~! (코드는 ML with python 깃허브 자료에서 가져왔다.)

import numpy as np
import pandas as pd
import mglearn
import os
# adult.data ships without a header row, so pass header=None and supply the
# column names explicitly through the "names" argument.
data = pd.read_csv(
    os.path.join(mglearn.datasets.DATA_PATH, "adult.data"),
    header=None,
    index_col=False,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
)
# Keep only a handful of columns for this example.
data = data[[
    'age', 'workclass', 'education', 'gender', 'hours-per-week',
    'occupation', 'income',
]]
# display() produces notebook-formatted output.
# NOTE(review): display is not imported in the visible part of this file;
# presumably it is the IPython.display function that Jupyter makes available
# implicitly -- confirm before running this outside a notebook.
display(data.head())


#2. 문자열로 된 범주형 데이터 불러오기

# Inspect the values of the categorical 'gender' column.
# Series.value_counts() lists each unique value together with the number of
# rows in which it occurs.
print(data['gender'].value_counts())

 Male      21790
 Female    10771
Name: gender, dtype: int64


#3. 범주형 변수(gender) 원-핫-인코딩으로 바꾸기. 

# One-hot encode the categorical columns with pd.get_dummies.
# "Encoding" here means converting information from one form into another.
# Applied to the whole DataFrame, get_dummies replaces each categorical
# (string-valued) column with one new indicator column per distinct value,
# named "<column>_<value>", so every original categorical attribute is split
# into several finer-grained ones.
data_dummies = pd.get_dummies(data)

# Compare the column names before and after the encoding.
print("원본 특성:\n", list(data.columns), "\n")
print("get_dummies 후의 특성:\n", list(data_dummies.columns))

# Note that the numeric (continuous) columns, age and hours-per-week, are left unchanged by the encoding.

원본 특성:
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 

get_dummies 후의 특성:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']


#4. 원-핫-인코딩 후 변화 확인하기

# Preview the first five rows of the one-hot encoded frame.
display(data_dummies.head(n=5))


#5. 인코딩한 데이터 타깃 속성 제외하고 추출한 다음 numpy배열로 바꿔주기
# Select every column from 'age' through 'occupation_ Transport-moving'.
# Label slicing with .loc INCLUDES the end label (unlike NumPy / positional
# slicing, which excludes the end), so this keeps all feature columns and
# stops just before the two 'income_*' target columns.
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']

# Convert the pandas objects into NumPy arrays.
X = features.to_numpy()
# The target is the single 'income_ >50K' indicator column.
y = data_dummies['income_ >50K'].to_numpy()
print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

X.shape: (32561, 44) y.shape: (32561,)


#6. 변환한 데이터로 모델 학습하기. 기존의 연속형 데이터와 과정이 같다.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Hold out a test split; random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)
# fit() returns the estimator itself, so construction and training can be
# chained into a single expression.
logreg = LogisticRegression(max_iter=5000).fit(X_train, y_train)
# Report the mean accuracy on the held-out test split.
print("테스트 점수: {:.2f}".format(logreg.score(X_test, y_test)))

테스트 점수: 0.81

#13. 범주형 데이터를 인코딩하기 | one-hot-encoding

Table of Contents

1. 특성 공학

2. 범주형 특성 다루기

2.2 범주형 변수를 표현하는 방법: 원-핫-인코딩one-hot-encoding

	age	workclass	education	gender	hours-per-week	occupation	income
0	39	State-gov	Bachelors	Male	40	Adm-clerical	<=50K
1	50	Self-emp-not-inc	Bachelors	Male	13	Exec-managerial	<=50K
2	38	Private	HS-grad	Male	40	Handlers-cleaners	<=50K
3	53	Private	11th	Male	40	Handlers-cleaners	<=50K
4	28	Private	Bachelors	Female	40	Prof-specialty	<=50K

	age	hours-per-week	workclass_ Private	workclass_ Self-emp-not-inc	workclass_ State-gov	...	occupation_ Prof-specialty	income_ <=50K
0	39	40	0	0	1	...	0	1
1	50	13	0	1	0	...	0	1
2	38	40	1	0	0	...	0	1
3	53	40	1	0	0	...	0	1
4	28	40	1	0	0	...	1	1

	age	hours-per-week	workclass_ Private	workclass_ Self-emp-not-inc	workclass_ State-gov	...	occupation_ Prof-specialty	income_ <=50K
0	39	40	0	0	1	...	0	1
1	50	13	0	1	0	...	0	1
2	38	40	1	0	0	...	0	1
3	53	40	1	0	0	...	0	1
4	28	40	1	0	0	...	1	1

	age	hours-per-week	workclass_ Private	workclass_ Self-emp-not-inc	workclass_ State-gov	...	occupation_ Prof-specialty	income_ <=50K
0	39	40	0	0	1	...	0	1
1	50	13	0	1	0	...	0	1
2	38	40	1	0	0	...	0	1
3	53	40	1	0	0	...	0	1
4	28	40	1	0	0	...	1	1