A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
dtypes:
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
A B C D
2013-01-01 -0.818896 -0.409184 0.087447 -0.935887
2013-01-02 0.681543 -1.390976 2.013105 0.644468
2013-01-03 1.017911 0.033224 -0.103912 0.634459
A B C D
2013-01-02 0.681543 -1.390976 2.013105 0.644468
2013-01-03 1.017911 0.033224 -0.103912 0.634459
2013-01-04 -0.450437 0.501915 1.003776 0.691249
Selection by Label
1
# Cross-section by label: the row whose index label is dates[0], returned as
# a Series indexed by column name.
print(df.loc[dates[0]])
A -0.818896
B -0.409184
C 0.087447
D -0.935887
Name: 2013-01-01 00:00:00, dtype: float64
1
# Label-based row slice plus a list of column labels. Both slice endpoints
# are included in the result (2013-01-02 through 2013-01-04).
print(df.loc["20130102":"20130104", ["A","B"]])
A B
2013-01-02 0.681543 -1.390976
2013-01-03 1.017911 0.033224
2013-01-04 -0.450437 0.501915
1
# A single row label with a list of columns reduces the result to a Series.
print(df.loc["20130102", ["A","B"]])
A 0.681543
B -1.390976
Name: 2013-01-02 00:00:00, dtype: float64
1 2
# Two ways to fetch one scalar by row label and column label; both print the
# same value.
print(df.loc[dates[0], "A"])
print(df.at[dates[0], "A"])  # equivalent to the prior method
-0.818895566676464
-0.818895566676464
Selection by Position
1
# Select the row at integer position 3 (the fourth row), returned as a Series.
print(df.iloc[3])
A -0.450437
B 0.501915
C 1.003776
D 0.691249
Name: 2013-01-04 00:00:00, dtype: float64
1
# Integer slices on both axes; the end positions are excluded, so this
# yields rows 3-4 and columns 0-1.
print(df.iloc[3:5, 0:2])
A B
2013-01-04 -0.450437 0.501915
2013-01-05 1.633764 0.324234
1
# Lists of integer positions pick specific rows (1, 2, 5) and columns (1, 3).
print(df.iloc[[1,2,5],[1,3]])
B D
2013-01-02 -1.390976 0.644468
2013-01-03 0.033224 0.634459
2013-01-06 -0.922663 0.505433
1
# Slice rows by position and keep every column.
print(df.iloc[1:3,:])
A B C D
2013-01-02 0.681543 -1.390976 2.013105 0.644468
2013-01-03 1.017911 0.033224 -0.103912 0.634459
1
# Keep every row and slice columns by position.
print(df.iloc[:,1:3])
B C
2013-01-01 -0.409184 0.087447
2013-01-02 -1.390976 2.013105
2013-01-03 0.033224 -0.103912
2013-01-04 0.501915 1.003776
2013-01-05 0.324234 -1.707570
2013-01-06 -0.922663 -1.641314
1 2
# Two ways to fetch one scalar by integer row and column position; both
# print the same value.
print(df.iloc[1, 1])
print(df.iat[1, 1])  # equivalent to the prior method
-1.3909764417520816
-1.3909764417520816
Boolean Indexing
1
# Keep only the rows where column A is greater than 0.
print(df[df["A"]>0])
A B C D
2013-01-02 0.681543 -1.390976 2.013105 0.644468
2013-01-03 1.017911 0.033224 -0.103912 0.634459
2013-01-05 1.633764 0.324234 -1.707570 1.163615
2013-01-06 0.282402 -0.922663 -1.641314 0.505433
1
# Element-wise mask: entries that are not greater than 0 are shown as NaN,
# while the row and column labels stay the same.
print(df[df>0])
A B C D
2013-01-01 NaN NaN 0.087447 NaN
2013-01-02 0.681543 NaN 2.013105 0.644468
2013-01-03 1.017911 0.033224 NaN 0.634459
2013-01-04 NaN 0.501915 1.003776 0.691249
2013-01-05 1.633764 0.324234 NaN 1.163615
2013-01-06 0.282402 NaN NaN 0.505433
A B C D E
2013-01-01 -0.818896 -0.409184 0.087447 -0.935887 one
2013-01-02 0.681543 -1.390976 2.013105 0.644468 one
2013-01-03 1.017911 0.033224 -0.103912 0.634459 two
2013-01-04 -0.450437 0.501915 1.003776 0.691249 three
2013-01-05 1.633764 0.324234 -1.707570 1.163615 four
2013-01-06 0.282402 -0.922663 -1.641314 0.505433 three
A B C D E
2013-01-03 1.017911 0.033224 -0.103912 0.634459 two
2013-01-05 1.633764 0.324234 -1.707570 1.163615 four
2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
A B C D F
2013-01-01 -0.818896 -0.409184 0.087447 -0.935887 NaN
2013-01-02 0.681543 -1.390976 2.013105 0.644468 1.0
2013-01-03 1.017911 0.033224 -0.103912 0.634459 2.0
2013-01-04 -0.450437 0.501915 1.003776 0.691249 3.0
2013-01-05 1.633764 0.324234 -1.707570 1.163615 4.0
2013-01-06 0.282402 -0.922663 -1.641314 0.505433 5.0
1 2 3 4
# Three ways to assign into the frame, followed by a print of the result.
df.at[dates[0], "A"] = 0  # setting values by label
df.iat[0, 1] = 0  # setting values by position
df.loc[:, "D"] = np.array([5] * len(df))  # setting by assigning with a NumPy array
print(df)
A B C D F
2013-01-01 0.000000 0.000000 0.087447 5 NaN
2013-01-02 0.681543 -1.390976 2.013105 5 1.0
2013-01-03 1.017911 0.033224 -0.103912 5 2.0
2013-01-04 -0.450437 0.501915 1.003776 5 3.0
2013-01-05 1.633764 0.324234 -1.707570 5 4.0
2013-01-06 0.282402 -0.922663 -1.641314 5 5.0
A B C D F E
2013-01-01 0.000000 0.000000 0.087447 5 NaN 1.0
2013-01-02 0.681543 -1.390976 2.013105 5 1.0 1.0
2013-01-03 1.017911 0.033224 -0.103912 5 2.0 NaN
2013-01-04 -0.450437 0.501915 1.003776 5 3.0 NaN
1 2 3
# Compare the two dropna modes on df1; the bare print() separates the outputs.
print(df1.dropna(how="any"))  # how="any" (default): drop rows where any NA value is present
print()
print(df1.dropna(how="all"))  # how="all": drop rows only where all values are NA
A B C D F E
2013-01-02 0.681543 -1.390976 2.013105 5 1.0 1.0
A B C D F E
2013-01-01 0.000000 0.000000 0.087447 5 NaN 1.0
2013-01-02 0.681543 -1.390976 2.013105 5 1.0 1.0
2013-01-03 1.017911 0.033224 -0.103912 5 2.0 NaN
2013-01-04 -0.450437 0.501915 1.003776 5 3.0 NaN
A B C D F
2013-01-01 0.000000 0.000000 0.087447 5 NaN
2013-01-02 0.681543 -1.390976 2.013105 5 1.0
2013-01-03 1.017911 0.033224 -0.103912 5 2.0
2013-01-04 -0.450437 0.501915 1.003776 5 3.0
2013-01-05 1.633764 0.324234 -1.707570 5 4.0
2013-01-06 0.282402 -0.922663 -1.641314 5 5.0
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 4.0
2013-01-06 NaN
Freq: D, dtype: float64
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 0.017911 -0.966776 -1.103912 4.0 1.0
2013-01-04 -3.450437 -2.498085 -1.996224 2.0 0.0
2013-01-05 -2.366236 -3.675766 -5.707570 1.0 0.0
2013-01-06 NaN NaN NaN NaN NaN
Apply
1
# Apply np.cumsum to each column, giving a running total down the rows.
print(df.apply(np.cumsum))
A B C D F
2013-01-01 0.000000 0.000000 0.087447 5 NaN
2013-01-02 0.681543 -1.390976 2.100552 10 1.0
2013-01-03 1.699454 -1.357753 1.996640 15 3.0
2013-01-04 1.249017 -0.855837 3.000416 20 6.0
2013-01-05 2.882781 -0.531603 1.292846 25 10.0
2013-01-06 3.165183 -1.454266 -0.348468 30 15.0
1
# Per-column range (max minus min); the result is a Series indexed by
# column name.
print(df.apply(lambda x: x.max() - x.min()))
A 2.084201
B 1.892892
C 3.720675
D 0.000000
F 4.000000
dtype: float64
Histogramming
1 2 3 4
# Draw 10 random integers from 0 through 6, then count how often each occurs.
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
print()  # blank line between the two outputs
print(s.value_counts())
A B C D
0 foo one 0.031321 -0.238988
1 bar one -0.626332 -0.445851
2 foo two -1.448981 1.262838
3 bar three -0.424664 -0.639157
4 foo two 1.547849 2.378992
5 bar two -0.351304 -0.521492
6 foo one -1.903777 1.998802
7 foo three -0.613947 -0.422391
1
# Group rows by column A and sum the numeric columns C and D per group.
# The columns are selected explicitly so that the string column B is left
# out of the sum regardless of the pandas version's numeric_only default.
print(df.groupby("A")[["C", "D"]].sum())
C D
A
bar -1.402300 -1.606500
foo -2.387536 4.979252
1
# Group by the (A, B) pair, producing a hierarchical row index, and sum the
# remaining columns within each group.
print(df.groupby(["A","B"]).sum())
C D
A B
bar one -0.626332 -0.445851
three -0.424664 -0.639157
two -0.351304 -0.521492
foo one -1.872457 1.759814
three -0.613947 -0.422391
two 0.098868 3.641830
C_min C_max
A B
bar one -0.626332 -0.626332
three -0.424664 -0.424664
two -0.351304 -0.351304
foo one -1.903777 0.031321
three -0.613947 -0.613947
two -1.448981 1.547849
Reshaping
Stack
stack() : Compress a level in the DataFrame’s columns:
A B
idx_1 idx_2
bar one -0.748802 0.560048
two -0.214015 -0.658540
baz one -1.968829 -0.806776
two -1.314742 -0.174498
idx_1 bar baz
idx_2
one A -0.748802 -1.968829
B 0.560048 -0.806776
two A -0.214015 -1.314742
B -0.658540 -0.174498
idx_2 one two
idx_1
bar A -0.748802 -0.214015
B 0.560048 -0.658540
baz A -1.968829 -1.314742
B -0.806776 -0.174498
A B C D E
0 one aa foo 1.055556 -0.342298
1 one bb foo -0.463657 -0.004332
2 two cc foo 0.953746 0.690613
3 three aa bar -0.980697 0.498251
4 one bb bar -0.352120 -0.503475
5 one cc bar 0.298470 -1.316212
6 two aa foo 0.580929 0.483970
7 three bb foo 0.391527 0.200354
8 one cc foo -1.314898 -1.183403
9 one aa bar -0.058855 -0.004713
10 two bb bar -0.253133 1.255313
11 three cc bar 0.882602 0.561369
C bar foo
A B
one aa -0.058855 1.055556
bb -0.352120 -0.463657
cc 0.298470 -1.314898
three aa -0.980697 NaN
bb NaN 0.391527
cc 0.882602 NaN
two aa NaN 0.580929
bb -0.253133 NaN
cc NaN 0.953746
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']