본문 바로가기
728x90

Data Analytics with Python (114)

[Pandas][DataFrame][concat]S3_02_concatenation_with_multi_indexing In [1]: import pandas as pd In [2]: raw_data = {'Bank Client ID': ['1', '2', '3', '4', '5'], 'First Name': ['Robert', 'Benedict', 'Mark', 'Tom', 'Ryan'], 'Last Name': ['Downey', 'Cumberbatch', 'Ruffalo', 'Holland', 'Reynolds']} bank1_df = pd.DataFrame(raw_data, columns = ['Bank Client ID', 'First Name', 'Last Name']) In [3]: raw_data = {'Bank Client ID': ['6', '7', '8', '9', '10'], 'First Name':.. 2023. 1. 21.
[Pandas][DataFrame][concat]S3_01_concatenation In [1]: import pandas as pd https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html In [2]: raw_data = {'Bank Client ID': ['1', '2', '3', '4', '5'], 'First Name': ['Robert', 'Benedict', 'Mark', 'Tom', 'Ryan'], 'Last Name': ['Downey', 'Cumberbatch', 'Ruffalo', 'Holland', 'Reynolds']} bank1_df = pd.DataFrame(raw_data, columns = ['Bank Client ID', 'First Name', 'Last Name']) bank1_df O.. 2023. 1. 21.
[Pandas][DataFrame]S2_14_change_datatypes In [59]: import pandas as pd In [60]: employee_df = pd.read_csv('Human_Resources_Employee.csv') employee_df.head() Out[60]: Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole Yea.. 2023. 1. 21.
[Pandas][DataFrame]S2_13_FeatureEngineering Feature Engineering and Dealing with Missing Dataset¶ https://www.kaggle.com/datasets/rishikeshkonapure/hr-analytics-prediction?resource=download 위 데이터의 일부를 Null 처리함 In [46]: import pandas as pd In [48]: employee_df = pd.read_csv('Human_Resources_Employee.csv') employee_df.head() Out[48]: Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount Em.. 2023. 1. 21.
[Pandas][DataFrame]S2_12_Operations_Filtering In [1]: import pandas as pd In [2]: bank_df = pd.read_csv('bank customers.csv') bank_df Out[2]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 2 3 15619304 Onio 502 France.. 2023. 1. 21.
[Pandas][DataFrame]S2_11_define_functions In [1]: import pandas as pd In [2]: bank_df = pd.read_csv('bank customers.csv') bank_df Out[2]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 2 3 15619304 Onio 502 France.. 2023. 1. 21.
[Pandas][DataFrame]S2_10_sorting_and_ordering In [1]: import pandas as pd In [2]: bank_df = pd.read_csv('bank customers.csv') bank_df Out[2]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 2 3 15619304 Onio 502 France.. 2023. 1. 21.
[Pandas][DataFrame]S2_09_Broadcasting In [1]: import pandas as pd In [2]: bank_df = pd.read_csv('bank customers.csv') bank_df Out[2]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 2 3 15619304 Onio 502 France.. 2023. 1. 21.
[Pandas][DataFrame]S2_08_integer_index_Based_elements_selection In [23]: import pandas as pd In [25]: bank_df = pd.read_csv('bank customers.csv') bank_df Out[25]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 2 3 15619304 Onio 502 Fra.. 2023. 1. 21.
[Pandas][DataFrame]S2_07_Label_Based_elements_selection In [1]: import pandas as pd In [3]: bank_df = pd.read_csv('bank customers.csv', index_col='Surname') bank_df Out[3]: RowNumber CustomerId CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited Surname Hargrave 1 15634602 619 France Female 42 2 0.00 1 1 1 101348.88 1 Hill 2 15647311 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 Onio 3 1561.. 2023. 1. 21.
[Pandas][DataFrame]S2_06_Column_ADDING_DELETING In [1]: import pandas as pd In [2]: bank_df = pd.read_csv('bank customers.csv') bank_df Out[2]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 2 3 15619304 Onio 502 France.. 2023. 1. 21.
[Pandas][DataFrame]S2_05_selecting_columns 데이터 출처: https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers?resource=download In [1]: import pandas as pd In [3]: bank_df = pd.read_csv('bank customers.csv'); bank_df Out[3]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2.. 2023. 1. 21.
[Pandas][DataFrame]S2_04_index_setting 데이터 출처: https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers?resource=download In [1]: import pandas as pd In [7]: bank_df = pd.read_csv('bank customers.csv'); bank_df Out[7]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2.. 2023. 1. 21.
[Pandas][DataFrame]S2_03_Outputs In [1]: import pandas as pd In [2]: bank_df = pd.read_csv('bank customers.csv'); bank_df Out[2]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1 112542.58 0 2 3 15619304 Onio 502 Franc.. 2023. 1. 21.
[Pandas][DataFrame]S2_02_Inputs 데이터 출처: https://www.kaggle.com/datasets/mathchi/churn-for-bank-customers In [1]: import pandas as pd In [2]: bank_df = pd.read_csv('bank customers.csv'); bank_df Out[2]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited 0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1 101348.88 1 1 2 15647311 Hill 608.. 2023. 1. 21.
[Pandas][DataFrame]S2_01_DataFrame In [1]: import pandas as pd In [3]: # data client_df = pd.DataFrame({'Client ID':[111, 112, 113, 114], 'Client Name':['Michael','Donald','John','Matthew'], 'Net Worth[$]': [3000, 40000, 100000, 15000], 'Years': [5, 9, 10, 12]}) client_df Out[3]: Client ID Client Name Net Worth[$] Years 0 111 Michael 3000 5 1 112 Donald 40000 9 2 113 John 100000 10 3 114 Matthew 15000 12 In [4]: # the data type t.. 2023. 1. 17.
[Pandas][Series]S1_12_Slicing In [2]: import pandas as pd In [3]: prices = pd.read_csv('/content/sample_data/prices.csv', squeeze=True); prices Out[3]: 0 2.55 1 3.39 2 2.75 3 3.39 4 3.39 ... 541905 2.10 541906 4.15 541907 4.15 541908 4.95 541909 18.00 Name: Price, Length: 541910, dtype: float64 In [4]: # Slice elements from a Pandas Series # starting from index 0 up until and not including stop index prices[0:5] Out[4]: 0 2... 2023. 1. 17.
[Pandas][Series]S1_11_Indexing In [1]: import pandas as pd In [6]: prices = pd.read_csv('/content/sample_data/prices.csv', squeeze=True); prices Out[6]: 0 2.55 1 3.39 2 2.75 3 3.39 4 3.39 ... 541905 2.10 541906 4.15 541907 4.15 541908 4.95 541909 18.00 Name: Price, Length: 541910, dtype: float64 In [3]: # the first element in a Pandas Series # ★ index starts from zero! prices[0] Out[3]: 2.55 In [12]: # the fifth element in a .. 2023. 1. 17.
[Pandas][Series]S1_10_Checking element In [1]: import pandas as pd In [2]: prices = pd.read_csv('/content/sample_data/prices.csv', squeeze=True) In [6]: # Check if a given number exists in a pandas Series values 5.79 in prices.values Out[6]: True In [7]: # Check if a given number exists in a pandas Series index 5.79 in prices.index Out[7]: False In [8]: # 'in' will search in pandas index by default 5.79 in prices Out[8]: False 2023. 1. 17.
[Pandas][Series]S1_09_Math Operations In [ ]: import pandas as pd In [ ]: prices = pd.read_csv('/content/sample_data/prices.csv', squeeze=True) In [ ]: # Apply Sum prices.sum() Out[ ]: 2498821.9739999995 In [ ]: # Apply count prices.count() Out[ ]: 541910 In [ ]: # the maximum value prices.max() Out[ ]: 38970.0 In [ ]: # the minimum value prices.min() Out[ ]: -11062.06 In [ ]: # all statistical information prices.describe() Out[ ]: .. 2023. 1. 17.
728x90