-
Genetic Algorithm (feature selection) + LightGBM — Base Line/python 기초 코드 2022. 8. 25. 17:45
from __future__ import print_function import numpy as num from sklearn import datasets, linear_model from catboost import CatBoostRegressor, Pool from genetic_selection import GeneticSelectionCV import numpy as np import pandas as pd import os import math import random from tqdm import tqdm from scipy.stats import skew,kurtosis import matplotlib.pyplot as plt import seaborn as sns import warnings import time import itertools import statsmodels.api as sm from joblib import Parallel,delayed from sklearn.ensemble import RandomForestRegressor from sklearn.cluster import KMeans from sklearn.metrics import mean_squared_error from sklearn.model_selection import RepeatedKFold, cross_val_score, StratifiedKFold,KFold from sklearn.ensemble import RandomForestClassifier from catboost import CatBoostRegressor, Pool warnings.filterwarnings(action='ignore') from catboost import CatBoostClassifier import lightgbm as lgb SEED = 42 def seed_everything(seed): random.seed(seed) os.environ['PYTHONHASHEED'] = str(seed) np.random.seed(seed) seed_everything(SEED)#시드고정 #print('Feature Selection:', x.columns[selectors.support_]) #print(selectors.support_)
# Load every CSV under ./input/lg-open into a module-level variable named
# after the file stem (e.g. train.csv -> `train`).  The 'meta' subdirectory
# is descended one level and its CSVs are loaded the same way.
data_dir = './input/lg-open'
entries = os.listdir(data_dir)
for entry in tqdm(entries):
    stem = entry.split('.')[0]
    if entry == 'meta':
        # 'meta' is a directory: load each file inside it.
        for meta_file in os.listdir(data_dir + f'/{entry}'):
            meta_stem = meta_file.split('.')[0]
            print(meta_stem)
            globals()[meta_stem] = pd.read_csv(data_dir + f'/meta/{meta_file}')
    else:
        print(stem)
        globals()[stem] = pd.read_csv(data_dir + f'/{entry}')
def lg_nrmse(gt, preds):
    """Return the competition score: a weighted sum of per-target NRMSE.

    Parameters
    ----------
    gt : np.ndarray, shape (n_samples, >=14)
        Ground-truth target matrix (Y_01..Y_14 in the first 14 columns).
    preds : np.ndarray
        Predictions with the same layout as ``gt``.

    Returns
    -------
    float
        ``1.2 * sum(NRMSE of columns 0-7) + 1.0 * sum(NRMSE of columns 8-13)``
        where NRMSE = RMSE / mean(|gt column|).

    Note: the previous implementation called
    ``mean_squared_error(..., squared=False)``; the ``squared`` parameter was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the RMSE is now
    computed directly with NumPy (numerically identical).
    """
    all_nrmse = []
    for idx in range(14):  # first 14 target columns; 'ID' is assumed already dropped
        rmse = np.sqrt(np.mean((gt[:, idx] - preds[:, idx]) ** 2))
        nrmse = rmse / np.mean(np.abs(gt[:, idx]))
        all_nrmse.append(nrmse)
    # Y_01..Y_08 carry a 20% extra weight.
    return 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
# Split the loaded frames into features (columns matching 'X') and
# targets (columns matching 'Y') using pandas' regex column filter.
train_x, train_y = train.filter(regex='X'), train.filter(regex='Y')
test_x = test.filter(regex='X')
from sklearn_genetic import GASearchCV from sklearn_genetic.space import Integer, Categorical, Continuous from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space from sklearn_genetic.callbacks import LogbookSaver, ProgressBar from sklearn.model_selection import train_test_split, KFold from sklearn.metrics import r2_score
# data load complete — GA feature selection wrapped around a LightGBM
# regressor, fitted independently for each of the 14 targets Y_01..Y_14.
base_regressor = lgb.LGBMRegressor()
predictions = []    # consumed later when building the submission
per_target = []     # one prediction vector per target column
selector = GeneticSelectionCV(
    base_regressor,
    cv=10,
    verbose=0,
    scoring="r2",
    max_features=5,
    n_population=100,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=50,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.04,
    tournament_size=3,
    n_gen_no_change=10,
    caching=True,
    n_jobs=-1,
)
for j in range(1, 15):
    # Column names are zero-padded below 10 (Y_01..Y_09), plain above.
    if j < 10:
        print("i wiil go")
        print(f'Y_0{j}')
        target = train_y[f'Y_0{j}']
        done_msg = "go"
    else:
        print("i wiil going")
        print(f'Y_{j}')
        target = train_y[f'Y_{j}']
        done_msg = "going"
    selector.fit(train_x, target)              # GeneticSelectionCV.fit returns self
    per_target.append(selector.predict(test_x))
    print(done_msg)
predictions.append(per_target)
print(predictions)
# Average over the outer prediction list (axis 0) and transpose so rows are
# samples and columns are targets, then fill the submission frame.
averaged = np.mean(np.array(predictions), axis=0).T
for position, name in enumerate(sample_submission.columns):
    if name != 'ID':
        # Shift by one because column 0 of the submission is 'ID'.
        sample_submission[name] = averaged[:, position - 1]
print('Done.')
sample_submission.to_csv('submission_GA_lightgbm.csv', index=False)
데이터에 따라 성능이 갈리는 것 같다.
reference
'Base Line > python 기초 코드' 카테고리의 다른 글
리스트 딥카피 (0) 2023.08.24 sobel, canny, laplacian filter (1) 2022.12.03 Auto encoder (0) 2022.08.23 sklearn 회귀관련 모델 정리 (0) 2022.08.16 torch check list (0) 2022.08.14