  • Genetic Algorithm (feature selection) + LightGBM
    Base Line / Python basic code  2022. 8. 25. 17:45

    from __future__ import print_function
    import os
    import math
    import random
    import time
    import itertools
    import warnings

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import statsmodels.api as sm
    import lightgbm as lgb
    from tqdm import tqdm
    from scipy.stats import skew, kurtosis
    from joblib import Parallel, delayed
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool
    from genetic_selection import GeneticSelectionCV
    from sklearn import datasets, linear_model
    from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
    from sklearn.cluster import KMeans
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import RepeatedKFold, cross_val_score, StratifiedKFold, KFold

    warnings.filterwarnings(action='ignore')
    
    SEED = 42
    def seed_everything(seed):
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
    seed_everything(SEED)  # fix the random seed
    #print('Feature Selection:', x.columns[selectors.support_])
    #print(selectors.support_)
    # set the data directory
    data_dir = './input/lg-open'
    # build a list of everything inside the directory
    list1 = os.listdir(data_dir)
    
    # read every csv; each file becomes a global DataFrame named after the file
    # (e.g. train, test, sample_submission), with the meta/ folder handled separately
    for file in tqdm(list1):
        tmp = file.split('.')[0]
        if file == 'meta':
            list2 = os.listdir(data_dir + f'/{file}')
            for file2 in list2:
                tmp2 = file2.split('.')[0]
                print(tmp2)
                globals()[f'{tmp2}'] = pd.read_csv(data_dir + f'/meta/{file2}')
        else:
            print(tmp)
            globals()[f'{tmp}'] = pd.read_csv(data_dir + f'/{file}')
    def lg_nrmse(gt, preds):
        # sum of NRMSE over each Y target, with Y_01 ~ Y_08 weighted an extra 20%
        # (competition metric; gt and preds hold only the 14 Y columns, no ID)
        all_nrmse = []
        for idx in range(0, 14):
            rmse = mean_squared_error(gt[:, idx], preds[:, idx], squared=False)
            nrmse = rmse / np.mean(np.abs(gt[:, idx]))
            all_nrmse.append(nrmse)
        score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
        return score
    train_x = train.filter(regex='X')
    train_y = train.filter(regex='Y')
    test_x = test.filter(regex='X')
    from sklearn_genetic import GASearchCV
    from sklearn_genetic.space import Integer, Categorical, Continuous
    from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space
    from sklearn_genetic.callbacks import LogbookSaver, ProgressBar
    from sklearn.model_selection import train_test_split, KFold
    from sklearn.metrics import r2_score
    # data loading done
    # LightGBM regressor used as the base estimator inside the genetic feature selector
    estimators = lgb.LGBMRegressor()
    
    predictions = []
    predictions_ = []
    
    # genetic-algorithm feature selection wrapped around the LightGBM regressor:
    # each individual encodes a candidate feature subset (capped at max_features),
    # fitness is its 10-fold cross-validated r2, evolved for up to 50 generations
    # with early stopping after 10 generations without improvement
    models = GeneticSelectionCV(
        estimators, cv=10, verbose=0,
        scoring="r2", max_features=5,
        n_population=100, crossover_proba=0.5,
        mutation_proba=0.2, n_generations=50,
        crossover_independent_proba=0.5,
        mutation_independent_proba=0.04,
        tournament_size=3, n_gen_no_change=10,
        caching=True, n_jobs=-1)
    
    
    # fit one GA-selected LightGBM model per target column Y_01 ~ Y_14
    for j in range(1, 15):
        col = f'Y_{j:02d}'
        print(col)
        train_y_ = train_y[col]
        models = models.fit(train_x, train_y_)
        prediction = models.predict(test_x)
        predictions_.append(prediction)
            
    predictions.append(predictions_)     
    print(predictions)
    
    
    #prediction = models.predict(test_x)
    #print('Feature Selection:', x.columns[models.support_])
    final_prediction = np.mean(np.array(predictions), axis=0).T
    
    # copy the stacked predictions into the submission template (skip the ID column)
    for idx, col in enumerate(sample_submission.columns):
        if col == 'ID':
            continue
        sample_submission[col] = final_prediction[:, idx-1]
    print('Done.')
    sample_submission.to_csv('submission_GA_lightgbm.csv', index=False)
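
    The commented-out models.support_ line above is worth actually running: GeneticSelectionCV exposes the chosen feature mask through its support_ attribute. A minimal sketch of my own (not part of the original code), reusing the models and train_x objects from the script:

    # `models` was refit once per target, so only the mask for the last target (Y_14)
    # survives the loop; to keep every mask, append models.support_ inside the loop
    # right after each fit call.
    selected = train_x.columns[models.support_]
    print('Selected features (last fitted target):', list(selected))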

    Performance seems to depend heavily on the dataset.
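
    One way to gauge this locally (a rough sketch I added, not from the original post) is to hold out part of the training data and score it with the lg_nrmse metric defined above; every name below already exists in the script.

    # hold out 20% of the training rows for local validation
    tr_x, val_x, tr_y, val_y = train_test_split(
        train_x, train_y, test_size=0.2, random_state=SEED)

    # one plain LightGBM model per target as a quick local baseline
    val_preds = []
    for col in tr_y.columns:
        m = lgb.LGBMRegressor(random_state=SEED)
        m.fit(tr_x, tr_y[col])
        val_preds.append(m.predict(val_x))

    val_preds = np.array(val_preds).T  # shape (n_val, 14)
    print('holdout lg_nrmse:', lg_nrmse(val_y.to_numpy(), val_preds))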



    reference
