1.describe():查看每一列的描述性统计量
import numpy as np
import pandas as pd
# Build a 5x3 frame of random integers drawn from [0, 10), with rows
# labelled A-E and columns named Python / NumPy / Pandas.
df = pd.DataFrame(
    np.random.randint(0, 10, size=(5, 3)),
    index=["A", "B", "C", "D", "E"],
    columns=["Python", "NumPy", "Pandas"],
)
df  # notebook cell: display the frame
# Per-column summary: count, mean, std, min, 25%/50%/75% quantiles and max.
df.describe()
Python NumPy Pandas count 5.000000 5.000000 5.000000 mean 5.600000 2.800000 5.400000 std 2.073644 2.167948 2.408319 min 4.000000 0.000000 2.000000 25% 4.000000 1.000000 4.000000 50% 5.000000 4.000000 6.000000 75% 6.000000 4.000000 7.000000 max 9.000000 5.000000 8.000000
# Ask for specific percentiles instead of the default quartiles; the recorded
# output below also contains the 50% (median) row.
df.describe(percentiles=[0.01, 0.3, 0.4, 0.9, 0.99])
Python NumPy Pandas count 5.000000 5.000000 5.000000 mean 5.600000 2.800000 5.400000 std 2.073644 2.167948 2.408319 min 4.000000 0.000000 2.000000 1% 4.000000 0.040000 2.080000 30% 4.200000 1.600000 4.400000 40% 4.600000 2.800000 5.200000 50% 5.000000 4.000000 6.000000 90% 7.800000 4.600000 7.600000 99% 8.880000 4.960000 7.960000 max 9.000000 5.000000 8.000000
# Same summary, transposed: one row per column of df, one column per statistic.
df.describe(percentiles=[0.01, 0.3, 0.4, 0.9, 0.99]).T
count mean std min 1% 30% 40% 50% 90% 99% max Python 5.0 5.6 2.073644 4.0 4.00 4.2 4.6 5.0 7.8 8.88 9.0 NumPy 5.0 2.8 2.167948 0.0 0.04 1.6 2.8 4.0 4.6 4.96 5.0 Pandas 5.0 5.4 2.408319 2.0 2.08 4.4 5.2 6.0 7.6 7.96 8.0
2.df.std():可以求得DataFrame对象每一列的标准差
# Standard deviation of each column, returned as a Series indexed by column name.
df.std()
Python 2.073644
NumPy 2.167948
Pandas 2.408319
dtype: float64
3.df.drop():删除指定索引的行或指定的列
# Work on a copy so the drop() examples (including the in-place one) do not
# modify df itself.
df2 = df.copy()
df2  # notebook cell: display the copy
Python NumPy Pandas A 9 0 8 B 5 1 2 C 6 5 7 D 4 4 6 E 4 4 4
# A bare label is looked up in the row index: returns a new frame without
# row "A" (df2 is not modified).
df2.drop("A")
Python NumPy Pandas B 5 1 2 C 6 5 7 D 4 4 6 E 4 4 4
# Same result as above, spelled with the explicit index= keyword.
df2.drop(index="A")
Python NumPy Pandas B 5 1 2 C 6 5 7 D 4 4 6 E 4 4 4
# axis=1 makes the label refer to a column: returns a frame without "Python".
df2.drop("Python", axis=1)
NumPy Pandas A 0 8 B 1 2 C 5 7 D 4 6 E 4 4
# Same result as the axis=1 form, spelled with the explicit columns= keyword.
df2.drop(columns="Python")
NumPy Pandas A 0 8 B 1 2 C 5 7 D 4 6 E 4 4
# A list of labels removes several columns (or rows) in one call.
df2.drop(columns=["NumPy", "Python"])
df2.drop(index=["A", "B"])
Python NumPy Pandas C 6 5 7 D 4 4 6 E 4 4 4
# inplace=True mutates df2 directly instead of returning a new frame,
# so rows "A" and "B" are gone when df2 is displayed afterwards.
df2.drop(index=["A", "B"], inplace=True)
df2
Python NumPy Pandas C 6 5 7 D 4 4 6 E 4 4 4
4.unique():唯一,去重(只能用于Series一维数组)
# Distinct values of a single column (a Series), returned as a NumPy array.
df["Python"].unique()
array([9, 5, 6, 4])
5.df.query():按条件查询
# Filter rows with a comparison written as a string; column names are used
# directly inside the expression.
df.query("Python == 6")
df.query("Python > 6")
df.query("Python < 6")
Python NumPy Pandas B 5 1 2 D 4 4 6 E 4 4 4
# Combine conditions: each pair below writes the same filter twice, once with
# the keyword (and / or) and once with the symbol (& / |).
df.query("Python > 4 and NumPy == 5")
df.query("Python > 4 & NumPy == 5")
df.query("Python > 6 or NumPy == 8")
df.query("Python > 6 | NumPy == 8")
# Membership test against a literal list.
df.query("Python in [5,6,9]")
Python NumPy Pandas A 9 0 8 B 5 1 2 C 6 5 7
# Prefix a name with @ to reference a Python variable from inside the
# query string.
n = 6
df.query("Python == @n")
m = [5, 6, 9]
df.query("Python in @m")
Python NumPy Pandas A 9 0 8 B 5 1 2 C 6 5 7
6.df.sort_values():根据值排序
# Reorder rows by the "Python" column, smallest value first.
df.sort_values("Python")
Python NumPy Pandas D 4 4 6 E 4 4 4 B 5 1 2 C 6 5 7 A 9 0 8
# Reorder rows by the "Python" column, largest value first.
df.sort_values("Python", ascending=False)
Python NumPy Pandas A 9 0 8 C 6 5 7 B 5 1 2 D 4 4 6 E 4 4 4
# With axis=1 the label names a row: the columns are reordered by their
# values in row "B".
df.sort_values("B", axis=1)
NumPy Pandas Python A 0 8 9 B 1 2 5 C 5 7 6 D 4 6 4 E 4 4 4
7.df.sort_index():根据索引排序
# Reorder rows by their index labels in descending order (E down to A).
df.sort_index(ascending=False)
Python NumPy Pandas E 4 4 4 D 4 4 6 C 6 5 7 B 5 1 2 A 9 0 8
# With axis=1 the column labels are sorted instead, here in descending order.
df.sort_index(ascending=False, axis=1)
Python Pandas NumPy A 9 8 0 B 5 2 1 C 6 7 5 D 4 6 4 E 4 4 4
8.df.info():查看数据信息
# Print a structural overview: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Python 5 non-null int32
1 NumPy 5 non-null int32
2 Pandas 5 non-null int32
dtypes: int32(3)
memory usage: 272.0+ bytes
9.练习
新建一个形状为10000*3的标准正态分布的DataFrame(np.random.randn),去除掉所有满足以下情况的行:其中任一元素绝对值大于3倍标准差
# Exercise data: 10000 rows x 3 columns sampled with np.random.randn
# (standard normal distribution).
df = pd.DataFrame(data=np.random.randn(10000, 3))
df  # notebook cell: display the frame
0 1 2 0 0.786386 -0.204965 -0.152465 1 0.400526 1.447733 0.310461 2 -0.363709 -0.989258 -1.093327 3 -2.856978 2.336645 1.474821 4 -0.847757 1.141278 -0.230877 ... ... ... ... 9995 0.331052 0.263059 0.469468 9996 1.172189 -1.380337 0.648793 9997 -0.544049 -0.509627 -0.224698 9998 -0.034967 -0.085575 -0.687314 9999 0.007202 -0.069250 -0.803754
10000 rows × 3 columns
# Boolean frame, same shape as df: True where a cell's absolute value exceeds
# three times the standard deviation of its own column.
cond = df.abs() > 3 * df.std()
cond
0 1 2 0 False False False 1 False False False 2 False False False 3 False False False 4 False False False ... ... ... ... 9995 False False False 9996 False False False 9997 False False False 9998 False False False 9999 False False False
10000 rows × 3 columns
# Collapse each row to a single flag: True if any of its cells was flagged.
cond2 = cond.any(axis=1)
cond2
0 False
1 False
2 False
3 False
4 False
...
9995 False
9996 False
9997 False
9998 False
9999 False
Length: 10000, dtype: bool
# Keep only the rows that were NOT flagged, i.e. discard every row holding an
# outlier.
df.loc[~cond2]
0 1 2 0 0.786386 -0.204965 -0.152465 1 0.400526 1.447733 0.310461 2 -0.363709 -0.989258 -1.093327 3 -2.856978 2.336645 1.474821 4 -0.847757 1.141278 -0.230877 ... ... ... ... 9995 0.331052 0.263059 0.469468 9996 1.172189 -1.380337 0.648793 9997 -0.544049 -0.509627 -0.224698 9998 -0.034967 -0.085575 -0.687314 9999 0.007202 -0.069250 -0.803754
9904 rows × 3 columns