ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • pandas - 1
    Base Line/python 기초 코드 2022. 7. 1. 22:04
    from pandas import Series, DataFrame
    import pandas as pd
    
    
    list_data = [1,2,3,4,5]
    example_obj = Series(data = list_data)
    example_obj
    
    
    list_data = [1,2,3,4,5]
    list_name = ["a","b","c","d","e"]
    example_obj = Series(data = list_data, index = list_name)
    example_obj
    
    
    
    import numpy as np
    
    
    dict_data = {"a":1,"b":2,"c":3,"d":4,"e":5}
    example_obj = Series(dict_data,dtype=np.float32,name="example_data")
    example_obj
    example_obj["a"]
    example_obj["a"] = 3.2
    example_obj
    
    
    
    example_obj.values
    example_obj.index
    example_obj.name = "number"
    example_obj.index.name = "alphabet"
    example_obj
    
    
    dict_data_1 = {"a":1,"b":2,"c":3,"d":4,"e":5}
    indexes = ["a","b","c","d","e","f","g","h"]
    series_obj_1 = Series(dict_data_1,index = indexes)
    series_obj_1
    
    
    
    # dataframe : Series를 모아서 만든 Data Table = 기본 2차원
    
    from pandas import Series, DataFrame
    import pandas as pd
    import numpy as np

     

    from pandas import Series, DataFrame
    import pandas as pd
    import numpy as np
    raw_data = {'first_name' : ['Jason','Molly','Tina','Jake','Any'],
                'last_name':['Miler','Jacobson','Ali','Milner','Cooze'],
                'age':[42,52,36,24,73],
                'city':['San Francisco','Baltimore','Miami','Douglas','Boston']}
    df = pd.DataFrame(raw_data,columns = ['first_name','last_name','age','city'])
    df
    
    
    
    df = DataFrame(raw_data,columns=["first_name","last_name","age","city","debt"])
    df.first_name
    
    
    df["first_name"]
    
    df
    
    
    #index location
    
    df.loc[1]
    
    
    #index position
    
    df["age"].iloc[1:]
    
    
    s= pd.Series(np.nan,index = [49,48,47,46,45,1,2,3,4,5])
    s.loc[:3]
    
    s.iloc[:3]
    df.debt = df.age>40
    df
    
    df.T
    df.values
    df.to_csv()
    del df["debt"]
    df
    df[:3]
    df
    
    import pandas as pd
    import numpy as np
    url = 'exceltasks/excel-comp-data.xlsx'
    df = pd.read_excel(url)
    df.head()
    df["account"][:3]
    
    account_series = df["account"]
    account_series[:3]
    
    account_series[[0,1,2]]
    account_series[account_series<250000]
    
    df.index = df["account"]
    del df["account"]
    df.head()
    
    df[["name","street"]][:2]
    df.iloc[:,:2]
    df.loc[[211829,320563],["name","street"]]
    df.index = list(range(0,15))
    df.head()
    df.drop(1)
    df.drop([0,1,2,3])
    df.drop("city",axis=1)  #df.drop(["city","state"],axis=1)
    from pandas import Series
    
    
    s1 = Series(range(1,6),index = list("abcde"))
    s1
    s2 = Series(range(5,11),index = list("bcedef"))
    s2
    s1.add(s2)
    s1+s2
    from pandas import DataFrame
    
    df1 = DataFrame(np.arange(9).reshape(3,3),columns = list("abc"))
    df1
    df2 = DataFrame(np.arange(16).reshape(4,4),columns = list("abcd"))
    df2
    df1+df2
    df1.add(df2,fill_value=0)
    df = DataFrame(np.arange(16).reshape(4,4),columns = list("abcd"))
    df
    s2 = Series(np.arange(10,14))
    s2
    df+s2
    df.add(s2,axis=0)
    s1 = Series(np.arange(10))
    s1.head(5)
    s1.map(lambda x :x**2).head(5)
    z={1:'A',2:'B',3:'C'}
    s1.map(z).head(5)
    
    s2 = Series(np.arange(10,20))
    s1.map(s2).head(5)
    
    df = pd.read_csv("wages.csv")
    df.head()
    
    df.sex.unique()
    
    df["sex_code"] = df.sex.map({"male":0,"female":1}).head()
    
    df.sex.replace(["male","female"],[0,1],inplace = True)
    df.head(5)
    
    df_info = df[["earn","height","age"]]
    df_info.head()
    
    f = lambda x : x.max() - x.min()
    df_info.apply(f)
    
    df_info.sum()
    df_info.apply(sum)
    f = lambda x : -x
    df_info.applymap(f).head(5)
    f = lambda x : -x
    df_info["earn"].apply(f).head(5)
    
    df = pd.read_csv("wages.csv")
    df.head()
    df.describe()
    
    df.race.unique()
    np.array(dict(enumerate(df["race"].unique())))
    value = list(map(int,np.array(list(enumerate(df["race"].unique())))[:,0].tolist()))
    key = np.array(list(enumerate(df["race"].unique())),dtype=str)[:,1].tolist()
    value,key
    df["race"].replace(to_replace=key,value = value,inplace = True)
    df.head(5)
    #column
    df.sum(axis = 0)
    #row
    df.sum(axis = 1)
    df.isnull()
    df.isnull().sum()
    df.sort_values(["age","earn"],ascending=True).head(10)
    df.age.corr(df.earn)
    df.age.cov(df.earn)
    df.corrwith(df.earn)
    df.corr()
    # data 
    import pandas as pd 
    
    ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
             'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
             'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
             'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
             'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
    
    df = pd.DataFrame(ipl_data)
    df
    
    df.groupby("Team")["Points"].sum()
    
    df.groupby(["Team","Year"])["Points"].sum()
    
    df
    h_index = df.groupby(["Team","Year"])["Points"].sum()
    h_index
    h_index.index
    h_index["Devils":"Kings"]
    
    h_index.unstack()
    h_index.swaplevel()
    h_index.swaplevel().sort_index(level=0)
    h_index.sum(level=0)
    
    h_index.sum(level=1)
    grouped = df.groupby("Team")
    
    for name, group in grouped:
        print(name)
        print(group)
        
        
    grouped.get_group("Devils")
    grouped.agg(sum)
    
    
    import numpy as np
    grouped.agg(np.mean)
    grouped['Points'].agg([np.sum, np.mean,np.std])
    df
    score = lambda x:(x)
    grouped.transform(score)
    df
    score = lambda x : (x.max())
    grouped.transform(score)
    df
    
    score = lambda x : (x - x.mean())/x.std()
    grouped.transform(score)
    df.groupby('Team').filter(lambda x: len(x) >=3)
    
    import dateutil
    import pandas as pd
    df_phone = pd.read_csv("phone_data.csv")
    df_phone['date'] = df_phone['date']
    
    df_phone.groupby('month')['duration'].sum()
    df_phone[df_phone['item']=='call'].groupby('network')['duration'].sum()
    df_phone.groupby(['month','item'])['date'].count()
    
    df_phone.groupby(['month','item'])['date'].count().unstack()
    df_phone.groupby('month',as_index=False).agg({"duration":"sum"})
    df_phone.groupby(['month','item']).agg({'duration':sum,
                                        'network_type':"count",
                                        'date':'first'})
                                        
                                        
    df_phone.groupby(['month','item']).agg({'duration':[min,max,sum],
                                            'network_type':"count",
                                            'date':[min,'first','nunique']})
                                            
                                            
    import numpy as np
    grouped = df_phone.groupby('month').agg({"duration":[min,max,np.mean]})
    grouped.columns = grouped.columns.droplevel(level=0)
    grouped.rename(columns={"min":"min_duration","max":"max_duration","mean":"mean_duration"})
    
    df_phone.pivot_table(["duration"],
                        index = [df_phone.month,df_phone.item],
                        columns = df_phone.network, aggfunc="sum",fill_value=0)
    
    
    df_movie = pd.read_csv("movie_rating.csv")
    df_movie.head()
    
      
    pd.crosstab(index=df_movie.critic,columns=df_movie.title,values = df_movie.rating,aggfunc="first").fillna(0)
    
    df_movie.pivot_table(["rating"],index = df_movie.critic,columns = df_movie.title, aggfunc="sum",fill_value=0)
    
    
    pd.merge(df_a,df_b,left_on = 'subject_id',right_on='subject_id')
    pd.merge(df_a,df_b,on='subject_id',how = 'left')
    pd.merge(df_a,df_b,on = 'subject_id',how = 'right')
    pd.merge(df_a,df_b,on'subject_id',how = 'outer')
    pd.merge(df_a,df_b,on = 'subject_id',how = 'inner')
    pd.merge(df_a,df_b,right_index=True,left_index=True)
    df_new = pd.concat([df_a,df_b])
    df_new.reset_index()
    df_a.append(df_b)
    df_new = pd.concat([df_a,df_b],axis=1)
    df_new.reset_index()
    import sqlite3
    conn = sqlite3.connext("./data/flights.db")
    cur = conn.cursor()
    cur.execute("select*from airlines limit 5;")
    results = cur.fetchall()
    results
    df_airplines = pd.read_sql_query("select*from airlines;",conn)
    df_airports = pd.read_sql_query("select*from airports;",conn)
    df_routes = pd.read_sql_query("select * from routes;",conn)
    
    
    df_routes.to_pickle("./data/df_routes.pickle")
    df_routes_pickle = pd.read_pickle("./data/df_routes.pickle")
    df_routes_pickle.head()
    writer = pd.ExcelWriter('./data/df_routes.xlsx',engine = 'xlsxwriter')
    df_routes.to_excel(writer,sheet_name = 'Sheet1')

    reference : https://github.com/TeamLab/introduction_to_python_TEAMLAB_MOOC

    'Base Line > python 기초 코드' 카테고리의 다른 글

    torch check list  (0) 2022.08.14
    pandas - 2  (0) 2022.07.12
    blender python script  (0) 2022.06.07
    pcl + open3d  (0) 2022.06.06
    pillow 이미지 rotation, flip  (1) 2022.04.18
Designed by Tistory.