728x90
In [1]:
import pandas as pd
In [3]:
bank_df = pd.read_csv('bank customers.csv', index_col='Surname')
bank_df
Out[3]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Hargrave | 1 | 15634602 | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
Hill | 2 | 15647311 | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
Onio | 3 | 15619304 | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
Boni | 4 | 15701354 | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
Mitchell | 5 | 15737888 | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Obijiaku | 9996 | 15606229 | 771 | France | Male | 39 | 5 | 0.00 | 2 | 1 | 0 | 96270.64 | 0 |
Johnstone | 9997 | 15569892 | 516 | France | Male | 35 | 10 | 57369.61 | 1 | 1 | 1 | 101699.77 | 0 |
Liu | 9998 | 15584532 | 709 | France | Female | 36 | 7 | 0.00 | 1 | 0 | 1 | 42085.58 | 1 |
Sabbatini | 9999 | 15682355 | 772 | Germany | Male | 42 | 3 | 75075.31 | 2 | 1 | 0 | 92888.52 | 1 |
Walker | 10000 | 15628319 | 792 | France | Female | 28 | 4 | 130142.79 | 1 | 1 | 0 | 38190.78 | 0 |
10000 rows × 13 columns
In [4]:
# Sort the dataframe in an alphabetical order
bank_df.sort_index(inplace=True); bank_df
Out[4]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Abazu | 3272 | 15708791 | 584 | Spain | Male | 32 | 9 | 85534.83 | 1 | 0 | 0 | 169137.24 | 0 |
Abazu | 5110 | 15576156 | 710 | Spain | Female | 28 | 6 | 0.00 | 1 | 1 | 0 | 48426.98 | 0 |
Abbie | 842 | 15737792 | 818 | France | Female | 31 | 1 | 186796.37 | 1 | 0 | 0 | 178252.63 | 0 |
Abbott | 2989 | 15684801 | 689 | France | Male | 47 | 1 | 93871.95 | 3 | 1 | 0 | 156878.42 | 1 |
Abbott | 4574 | 15693906 | 645 | France | Female | 24 | 3 | 34547.82 | 1 | 1 | 1 | 11638.17 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Zuev | 1031 | 15681174 | 730 | France | Male | 39 | 1 | 116537.60 | 1 | 0 | 0 | 145679.60 | 0 |
Zuyev | 587 | 15722010 | 621 | Spain | Male | 53 | 9 | 170491.84 | 1 | 1 | 0 | 35588.07 | 1 |
Zuyev | 6974 | 15583394 | 659 | Germany | Male | 39 | 8 | 106259.63 | 2 | 1 | 1 | 198103.32 | 0 |
Zuyeva | 6386 | 15706810 | 606 | Germany | Female | 32 | 1 | 106301.85 | 2 | 0 | 1 | 59061.25 | 0 |
Zuyeva | 7740 | 15689952 | 724 | Spain | Male | 41 | 5 | 0.00 | 1 | 0 | 1 | 115753.94 | 0 |
10000 rows × 13 columns
In [5]:
# loc is used to filter rows and columns
# loc is label-based meaning (by giving a name of rows, columns)
# rows
bank_df.loc['Zuyev']
Out[5]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Zuyev | 587 | 15722010 | 621 | Spain | Male | 53 | 9 | 170491.84 | 1 | 1 | 0 | 35588.07 | 1 |
Zuyev | 6974 | 15583394 | 659 | Germany | Male | 39 | 8 | 106259.63 | 2 | 1 | 1 | 198103.32 | 0 |
In [13]:
# select multiplle rows using "colon :"
# ★ this is inclusive meaning that both("Abbie" and "Abbott") were selected
# this will be different if we use integer based index as iloc[]
bank_df.loc["Abbie":"Abbott"]
Out[13]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Abbie | 842 | 15737792 | 818 | France | Female | 31 | 1 | 186796.37 | 1 | 0 | 0 | 178252.63 | 0 |
Abbott | 2989 | 15684801 | 689 | France | Male | 47 | 1 | 93871.95 | 3 | 1 | 0 | 156878.42 | 1 |
Abbott | 4574 | 15693906 | 645 | France | Female | 24 | 3 | 34547.82 | 1 | 1 | 1 | 11638.17 | 0 |
Abbott | 6839 | 15680804 | 850 | France | Male | 29 | 6 | 0.00 | 2 | 1 | 1 | 10672.54 | 0 |
Abbott | 2538 | 15723706 | 573 | France | Female | 33 | 0 | 90124.64 | 1 | 1 | 0 | 137476.71 | 0 |
In [14]:
# Select all elements up to and including 'Abbott' index
bank_df.loc[:"Abbott"]
Out[14]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Abazu | 3272 | 15708791 | 584 | Spain | Male | 32 | 9 | 85534.83 | 1 | 0 | 0 | 169137.24 | 0 |
Abazu | 5110 | 15576156 | 710 | Spain | Female | 28 | 6 | 0.00 | 1 | 1 | 0 | 48426.98 | 0 |
Abbie | 842 | 15737792 | 818 | France | Female | 31 | 1 | 186796.37 | 1 | 0 | 0 | 178252.63 | 0 |
Abbott | 2989 | 15684801 | 689 | France | Male | 47 | 1 | 93871.95 | 3 | 1 | 0 | 156878.42 | 1 |
Abbott | 4574 | 15693906 | 645 | France | Female | 24 | 3 | 34547.82 | 1 | 1 | 1 | 11638.17 | 0 |
Abbott | 6839 | 15680804 | 850 | France | Male | 29 | 6 | 0.00 | 2 | 1 | 1 | 10672.54 | 0 |
Abbott | 2538 | 15723706 | 573 | France | Female | 33 | 0 | 90124.64 | 1 | 1 | 0 | 137476.71 | 0 |
In [17]:
# select multiple rows
bank_df.loc[['Abbott','Zuyev']]
Out[17]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Abbott | 2989 | 15684801 | 689 | France | Male | 47 | 1 | 93871.95 | 3 | 1 | 0 | 156878.42 | 1 |
Abbott | 4574 | 15693906 | 645 | France | Female | 24 | 3 | 34547.82 | 1 | 1 | 1 | 11638.17 | 0 |
Abbott | 6839 | 15680804 | 850 | France | Male | 29 | 6 | 0.00 | 2 | 1 | 1 | 10672.54 | 0 |
Abbott | 2538 | 15723706 | 573 | France | Female | 33 | 0 | 90124.64 | 1 | 1 | 0 | 137476.71 | 0 |
Zuyev | 587 | 15722010 | 621 | Spain | Male | 53 | 9 | 170491.84 | 1 | 1 | 0 | 35588.07 | 1 |
Zuyev | 6974 | 15583394 | 659 | Germany | Male | 39 | 8 | 106259.63 | 2 | 1 | 1 | 198103.32 | 0 |
In [19]:
# randomly select
# Setting axis=0 means rows. setting index = 1 means columns
bank_df.sample(n=3,axis=0)
Out[19]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Chiebuka | 3505 | 15700710 | 490 | France | Female | 37 | 3 | 116465.53 | 1 | 0 | 1 | 24435.77 | 0 |
Kazantsev | 3238 | 15665760 | 802 | Spain | Male | 38 | 7 | 0.00 | 2 | 0 | 1 | 57764.65 | 0 |
Romero | 4420 | 15677586 | 587 | Germany | Female | 32 | 3 | 125445.04 | 2 | 1 | 1 | 130514.78 | 0 |
In [21]:
# randomly return apercentage of the rows
bank_df.sample(frac = 0.1, axis=0)
Out[21]:
RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Surname | |||||||||||||
Bruno | 1903 | 15622017 | 773 | Spain | Female | 33 | 10 | 0.00 | 1 | 1 | 1 | 98820.09 | 0 |
Warren | 9581 | 15778838 | 783 | France | Male | 38 | 9 | 114135.17 | 1 | 1 | 0 | 153269.98 | 0 |
Williams | 7487 | 15593694 | 814 | France | Male | 49 | 8 | 0.00 | 2 | 0 | 0 | 157822.54 | 0 |
Y?an | 8171 | 15641366 | 599 | Germany | Male | 61 | 1 | 124737.96 | 1 | 0 | 1 | 90389.61 | 1 |
Terry | 1836 | 15583718 | 696 | Germany | Male | 38 | 6 | 142316.14 | 1 | 1 | 1 | 8018.49 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Marshall | 1782 | 15771636 | 793 | Spain | Female | 36 | 0 | 0.00 | 1 | 0 | 0 | 148993.47 | 0 |
Mazzi | 710 | 15734886 | 686 | France | Female | 34 | 3 | 123971.51 | 2 | 1 | 0 | 147794.63 | 0 |
Lynch | 4965 | 15574761 | 466 | France | Female | 41 | 3 | 33563.95 | 2 | 1 | 0 | 178994.13 | 1 |
McDonald | 2505 | 15625942 | 619 | Spain | Female | 45 | 0 | 0.00 | 2 | 0 | 0 | 113645.40 | 0 |
Miller | 1905 | 15677382 | 625 | Spain | Female | 69 | 1 | 107569.96 | 1 | 1 | 1 | 182336.45 | 0 |
1000 rows × 13 columns
In [22]:
# columns
bank_df.loc[:,'CreditScore']
Out[22]:
Surname
Abazu 584
Abazu 710
Abbie 818
Abbott 689
Abbott 645
...
Zuev 730
Zuyev 621
Zuyev 659
Zuyeva 606
Zuyeva 724
Name: CreditScore, Length: 10000, dtype: int64
728x90
'Data Analytics with python > [Data Analysis]' 카테고리의 다른 글
[Pandas][DataFrame]S2_09_Broadcasting (0) | 2023.01.21 |
---|---|
[Pandas][DataFrame]S2_08_integer_index_Based_elements_selection (0) | 2023.01.21 |
[Pandas][DataFrame]S2_06_Column_ADDING_DELETING (0) | 2023.01.21 |
[Pandas][DataFrame]S2_05_selecting_columns (0) | 2023.01.21 |
[Pandas][DataFrame]S2_04_index_setting (0) | 2023.01.21 |
댓글