본문 바로가기

데이터 청년 캠퍼스(경남대학교)/스터디

2021-07-19

Wine Quality 분류 예측

클래스 불균형 데이터

클래스 불균형 기법을 이해하고 Accuracy 향상

 

import pandas as pd
import numpy as np

# Apply a Korean font so matplotlib can render Hangul axis labels/titles
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib import font_manager, rc
# Keep the minus sign rendering correctly when a non-default font is active
plt.rcParams['axes.unicode_minus']=False
# Windows system font (Malgun Gothic); adjust this path on other OSes
path='c:/Windows/Fonts/malgun.ttf'
font_name=font_manager.FontProperties(fname=path).get_name()
rc('font',family=font_name)
# Load the (class-imbalanced) wine-quality dataset and take a first look
wine = pd.read_csv('winequalityN_imblance.csv')
wine.head()
wine.info()

type 열만 object 타입

 

# Count missing values per column (isna is the canonical alias of isnull)
wine.isna().sum()

fixed_acidity 10개 / volatile_acidity 8개 / citric_acid 3개 / residual_sugar 2개 / chlorides 2개 / pH 9개 /

sulphates 4개의 결측치

 

# Drop every row that still contains any missing value, then re-inspect
wine = wine.dropna(axis=0, how='any')
wine.info()

결측치는 모두 drop

 

# Encode the wine type numerically in one pass: white -> 0, red -> 1
wine['type'] = wine['type'].replace({'white': 0, 'red': 1})

white wine 은 0으로 red wine은 1로 교체

 


정규화

# Split into feature matrix (all columns but the last) and label vector (last column)
X, y = wine.iloc[:, :-1], wine.iloc[:, -1]

feature데이터와 label데이터로 분리

 

# Min-max normalisation of the wide-range numeric columns to [0, 1].
# MinMaxScaler scales each column independently, so a single fit_transform
# over the column list is equivalent to the original per-column refits and
# also replaces the original combined fit_transform whose result was
# silently discarded (dead computation).
# NOTE(review): the scaler is fitted on the full data BEFORE the train/test
# split below, which leaks test-set statistics into the fit — consider
# fitting on X_train only and applying transform() to X_test.
from sklearn.preprocessing import MinMaxScaler

scale_cols = ['fixed_acidity', 'residual_sugar', 'free_sulfur_dioxide',
              'total_sulfur_dioxide', 'alcohol', 'pH']
scaler = MinMaxScaler(feature_range=(0, 1))
X[scale_cols] = scaler.fit_transform(X[scale_cols])

언더샘플링

 

# 언더샘플링 라이브러리와 다른 라이브러리 
!pip install imblearn
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
#train/test 분리
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,
                                                    random_state=0)
#언더샘플링
undersample = RandomUnderSampler(sampling_strategy='majority')

#fit and apply the transform
X_train_under,y_train_under = undersample.fit_resample(X_train,y_train)
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

model = SVC()
clf_under = model.fit(X_train_under,y_train_under)
pred_under = clf_under.predict(X_test)

print('ROC AUC score for undersampled data: ',roc_auc_score(y_test, pred_under))

결과는

ROC AUC score for undersampled data: 0.6959776796093932

 


오버샘플링

# SMOTE oversampling: synthesise new minority-class training samples
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train the same SVM on the SMOTE-balanced data and evaluate on the test set
model = SVC()
clf_SMOTE = model.fit(X_train_smote, y_train_smote)
pred_SMOTE = clf_SMOTE.predict(X_test)

print("ROC AUC score for oversampled SMOTE data: ", roc_auc_score(y_test, pred_SMOTE))

출력결과

ROC AUC score for oversampled SMOTE data: 0.7459544292025111


decision Tree

# Decision tree trained on the SMOTE-balanced data
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

dt = DecisionTreeClassifier(max_depth=10)
dt.fit(X_train_smote, y_train_smote)
y_pred = dt.predict(X_test)

# Test-set accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

0.8359979370809696

 

 

KNN

# k-nearest neighbours (k = 3) on the SMOTE-balanced training data
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_smote, y_train_smote)
y_pred = knn.predict(X_test)

# Classification accuracy and confusion matrix on the test set
accuracy = metrics.accuracy_score(y_test, y_pred)
cm = metrics.confusion_matrix(y_test, y_pred)
print(accuracy)

0.8782877772047447

 

# Sweep odd k values and record accuracy / confusion matrix for each
accuracy_set = []
cm_set = []
k_set = [1, 3, 5, 7, 9, 11]

for k in k_set:
    model_k = KNeighborsClassifier(n_neighbors=k)
    model_k.fit(X_train_smote, y_train_smote)
    preds = model_k.predict(X_test)
    accuracy_set.append(metrics.accuracy_score(y_test, preds))
    cm_set.append(metrics.confusion_matrix(y_test, preds))
print(accuracy_set)

[0.9195461578133058, 0.8782877772047447, 0.8566271273852502, 0.8339350180505415, 0.8215575038679732, 0.8112429087158329]

 

# Confusion matrix for the third sweep entry (k = 5)
print(cm_set[2])

[[  35   34]
 [ 244 1626]]

 

# Grid over k = 1..10 crossed with both weighting schemes, scored on the
# test set; keeps every fitted model alongside its metrics.
# NOTE(review): selecting hyperparameters by test-set accuracy overfits the
# test set — a validation split or CV would be the cleaner protocol.
weights_set = ['distance', 'uniform']
k_set = list(range(1, 11))
accuracy_set = []
cm_set = []
knn_set = []

for k in k_set:
    for weights in weights_set:
        print('k = %d, weights = %s' % (k, weights))
        clf = KNeighborsClassifier(n_neighbors=k, weights=weights)
        clf.fit(X_train_smote, y_train_smote)
        predictions = clf.predict(X_test)
        accuracy_set.append(metrics.accuracy_score(y_test, predictions))
        cm_set.append(metrics.confusion_matrix(y_test, predictions))
        knn_set.append(clf)

k = 1, weights = distance

k = 1, weights = uniform

k = 2, weights = distance

k = 2, weights = uniform

k = 3, weights = distance

k = 3, weights = uniform

k = 4, weights = distance

k = 4, weights = uniform

k = 5, weights = distance

k = 5, weights = uniform

k = 6, weights = distance

k = 6, weights = uniform

k = 7, weights = distance

k = 7, weights = uniform

k = 8, weights = distance

k = 8, weights = uniform

k = 9, weights = distance

k = 9, weights = uniform

k = 10, weights = distance

k = 10, weights = uniform

 

# Pretty-print the 20 accuracies (10 k values x 2 weighting schemes)
from pprint import pprint
pprint(accuracy_set)

[0.9195461578133058,

0.9195461578133058,

0.9195461578133058,

0.8648788035069623,

0.8984012377514182,

0.8782877772047447,

0.9009798865394534,

0.8406395048994327,

0.88602372356885,

0.8566271273852502,

0.888602372356885,

0.8236204228984012,

0.8767405879319237,

0.8339350180505415,

0.8772563176895307,

0.8071170706549768,

0.8664259927797834,

0.8215575038679732,

0.8679731820526044,

0.7983496647756576]

'데이터 청년 캠퍼스(경남대학교) > 스터디' 카테고리의 다른 글

2021-07-15  (0) 2021.07.16
2021-07-14  (0) 2021.07.14
2021-07-13  (0) 2021.07.14
2021-07-12  (0) 2021.07.12
2021 - 07 - 07  (0) 2021.07.07