본문 바로가기
Computer Science/Machine Learning

Linear Regression : Logistic Regression [은하계 종류 예측] (5)

by BaekDaBang 2024. 3. 24.
# 필요한 라이브러리를 임포트
import random
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')


# Pin every RNG seed so results are reproducible across runs.
seed = 42

random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)


# List every file available under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

 

1. Visualization

from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

# Load every galaxy image under IMAGE_PATH, resize to a common size,
# and show them side by side, titled with their file stems.
IMAGE_PATH = '/kaggle/input/galaxys'

galaxys = os.listdir(IMAGE_PATH)
title = [Path(name).stem for name in galaxys]

image_size = (300, 300)
images = []

for name in galaxys:
    with Image.open(os.path.join(IMAGE_PATH, name)) as img:
        # Resize first, then convert to a numpy array for matplotlib.
        images.append(np.array(img.resize(image_size)))

# Render one subplot per image.
fig, ax = plt.subplots(1, len(galaxys), figsize=(len(galaxys) * 3, 3))
for idx, (img_np, name) in enumerate(zip(images, title)):
    ax[idx].imshow(img_np)
    ax[idx].set_title(name)
    ax[idx].axis('off')

 

2. Feature Extraction

import torch
import torchvision
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from sklearn.decomposition import PCA
from tqdm import tqdm

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Suppress library warnings (already done once near the top of the file).
warnings.filterwarnings(action='ignore')
# Standard ImageNet-style preprocessing pipeline (resize, center-crop,
# tensor conversion, channel-wise normalization).
# NOTE(review): `transform` is not used by any code visible below —
# preprocess_image() builds its own chain. Confirm it is dead before removing.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# Load ImageNet-pretrained VGG16 and truncate the classifier head
# (drop its last 3 layers) so the network emits intermediate features
# instead of class logits.
model = torchvision.models.vgg16(pretrained=True)
model.classifier = nn.Sequential(*list(model.classifier.children())[:-3])

model = model.to(device)
def preprocess_image(base_path, image_path):
    """Load one image file and convert it to a normalized 4-D tensor.

    Parameters:
        base_path:  directory containing the image.
        image_path: file name, joined onto base_path.

    Returns:
        torch.Tensor of shape (1, C, 224, 224) with a leading batch dim.
    """
    # FIX: use a context manager so the file handle opened by PIL is
    # closed promptly instead of leaking until garbage collection.
    # convert('RGB') fully loads the pixel data, so the image remains
    # usable after the file is closed.
    with Image.open(os.path.join(base_path, image_path)) as img:
        image = img.convert('RGB')
    image = transforms.ToTensor()(image)
    image = transforms.Resize((224, 224))(image)
    image = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])(image)

    # Add the batch dimension the model expects.
    return image.unsqueeze(0)


def extract_features(base_path, image_path):
    """Run one image through the truncated VGG16 and return its features.

    Parameters:
        base_path:  directory containing the image.
        image_path: file name, joined onto base_path.

    Returns:
        1-D numpy array of features from the truncated classifier head.

    NOTE(review): relies on module-level `model`, `device` and
    `preprocess_image`.
    """
    image = preprocess_image(base_path, image_path)
    # FIX: preprocess_image already returns a tensor, so move it with
    # .to(device) — torch.as_tensor on an existing tensor triggers a
    # "copy construct" UserWarning and an unnecessary copy path.
    image = image.to(device)
    with torch.no_grad():  # inference only; no autograd bookkeeping
        features = model(image)

    # Detach to CPU and flatten to a 1-D feature vector.
    return features.cpu().numpy().flatten()
# Repeat the file list 25x to build a larger working set.
galaxys = galaxys * 25

# Extract one feature vector per (repeated) galaxy image.
features = [extract_features(IMAGE_PATH, galaxy) for galaxy in tqdm(galaxys)]

feature_df = pd.DataFrame(features)
display(feature_df.head())

# PCA: project the high-dimensional features onto the 64 principal
# components — the directions explaining the most variance.
pca = PCA(n_components=64)
features_pca = pd.DataFrame(pca.fit_transform(feature_df))

print(features_pca.shape)
display(features_pca.head())

 

3. Dataset

# Load the training set (trainX, trainY), evaluation set (testX) and
# the submission template (submit).
trainX = pd.read_csv('/kaggle/input/2023-ml-w5p1/trainX.csv')
trainY = pd.read_csv('/kaggle/input/2023-ml-w5p1/trainY.csv')
testX  = pd.read_csv('/kaggle/input/2023-ml-w5p1/testX.csv')
submit = pd.read_csv('/kaggle/input/2023-ml-w5p1/submit.csv')

# trainX holds the 64-dim vectors produced by the VGG16 pipeline.
print(f'shape of train data: {trainX.shape}')  # shape of train data: (6000, 64)
trainX.head()

# Inspect the label set (trainY).
display(trainY.head())

print(f'columns: {trainY.columns}')
print(f'labels : {trainY.Category.unique()}')

# The Id column carries no signal; drop it in place.
trainY.drop(columns=['Id'], inplace=True)

 

4. Preprocessing

# Inspect the container types before encoding.
print(type(trainY['Category']))			# <class 'pandas.core.series.Series'>
print(type(trainY['Category'].values))	# <class 'numpy.ndarray'>

# Encode the string labels ('edge', 'smooth', 'spiral') as integers.
le = LabelEncoder()
# FIX: once the Id column has been dropped, trainY['Category'].values is
# a plain 1-D array, so fit_transform consumes it directly — the
# original's DataFrame round-trip + ravel was redundant. (The shape
# (6000, 2) error it worked around only occurs when Id is still present.)
labels = le.fit_transform(trainY['Category'].values)

# Hold out 20% for validation, stratified on the label so class
# proportions are preserved. (The final model should be refit on the
# full training set trainX.)
x_train, x_val, y_train, y_val = train_test_split(trainX, labels, test_size=0.2, shuffle=True, random_state=seed, stratify=labels)

print(f'shape of train data, x_train: {x_train.shape}, y_train: {y_train.shape}')
print(f'shape of test  data, x_val  : {x_val.shape}, y_val  : {y_val.shape}')

 

5. Model Learning (Logistic Regression) & Evaluation : Validation Data

# Define the learning model.
# Logistic Regression: predicts the probability of an outcome from a
# linear combination of the independent variables.
clf = LogisticRegression(random_state=seed)

# Fit on the training split (x_train).
clf.fit(x_train, y_train)

# Predict on the validation split (x_val).
pred_val = clf.predict(x_val)

# Gauge model quality against the validation labels.
print(f'accuracy for validation data accuracy: {accuracy_score(y_val, pred_val)}')

print('confusion matrix:')
print(confusion_matrix(y_val, pred_val))

# FIX: the original zipped two *sets* — set(labels) with
# set(decoded_labels) — whose iteration orders are unrelated, so the
# printed label<->code pairs were not guaranteed to match. LabelEncoder
# assigns code i to le.classes_[i], so enumerate(le.classes_) yields the
# true mapping.
for encoded_label, decoded_label in enumerate(le.classes_):
    print(f"Label: {decoded_label:>6} -> Encoding: {encoded_label}")

from sklearn.preprocessing import StandardScaler

scaling = True

# Standardize features to zero mean / unit variance. Fit the scaler on
# the training split only and reuse its statistics on validation data to
# avoid leakage.
if scaling:
    scaler = StandardScaler()
    x_train_scale = scaler.fit_transform(x_train)
    x_val_scale = scaler.transform(x_val)
else:
    # FIX: without this branch, scaling=False left x_train_scale and
    # x_val_scale undefined and clf.fit below raised NameError.
    x_train_scale = x_train
    x_val_scale = x_val

# Define the learning model.
clf = LogisticRegression(random_state=seed)

# Fit on the (scaled) training split.
clf.fit(x_train_scale, y_train)

# Predict on the (scaled) validation split.
pred_val = clf.predict(x_val_scale)

# Gauge model quality against the validation labels.
print(f'accuracy for validation data accuracy: {accuracy_score(y_val, pred_val)}')

print('confusion matrix:')
print(confusion_matrix(y_val, pred_val))

 

6. Model Learning (Logistic Regression) & Prediction: Train Data

# Refit on the entire training set (trainX) for the final model;
# max_iter raised so the solver can fully converge.
clf = LogisticRegression(max_iter=1000, random_state=seed).fit(trainX, labels)

# Predict on the evaluation set (testX).
pred_result = clf.predict(testX)

# Inspect the raw predictions.
pred_result		# array([0, 1, 2, ..., 0, 2, 1])

# Predictions are still integer-encoded by the LabelEncoder; map them
# back to the original string labels.
pred_result = le.inverse_transform(pred_result)

pred_result		# array(['edge', 'smooth', 'spiral', ..., 'edge', 'spiral', 'smooth'], dtype=object)

 

7. Submit

# Check the submission template format.
submit.head()

# Insert the predictions into the template and write it to disk.
submit['Category'] = pred_result
submit.to_csv('Baseline.csv', index=False)

# Verify the final submission frame.
submit.head()