1:分组 t.groupby
#coding:utf-8
import pandas as pd
import numpy as np
file_path='./starbucks_store_worldwide.csv'
df=pd.read_csv(file_path)
#print(df.head(1))
#print(df.info())
grouped=df.groupby(by='Country')
print(grouped)
#DataFrameGroupBy
#可以遍历,也可以使用聚合方法
2:DataFrameGroupBy可以进行遍历
grouped=df.groupby(by='Country')
print(grouped)
#DataFrameGroupBy
#可以遍历
for i, j in grouped:
print(i)
print('_'*100)
print(j,type(j))
print('*'*100)
3:DateFrameGroupBy可以聚合
print(grouped.count()),可以对grouped进行统计操作
country_count=grouped['Brand'].count()
print(country_count['CN'])
print(country_count['US'])
4:统计中国每个省份店铺的数量
#coding:utf-8
import pandas as pd
import numpy as np
file_path='./starbucks_store_worldwide.csv'
df=pd.read_csv(file_path)
china_date=df[df['Country']=='CN']
#print(china_date)
grouped=china_date.groupby(by='City').count()['Brand']
print(grouped)
5:按照多条件进行分组
#coding:utf-8
import pandas as pd
import numpy as np
file_path='./starbucks_store_worldwide.csv'
df=pd.read_csv(file_path)
china_date=df[df['Country']=='CN']
#print(china_date)
#grouped=china_date.groupby(by='City').count()['Brand']
grouped=df['Brand'].groupby(by=[df['Country'],df['State/Province']]).count()
print(grouped)
print(type(grouped))
6:df['Brand']和df[['Brand']]一个代表Series格式,一个代表DateFrame格式
#coding:utf-8
import pandas as pd
import numpy as np
file_path='./starbucks_store_worldwide.csv'
df=pd.read_csv(file_path)
china_date=df[df['Country']=='CN']
#print(china_date)
#grouped=china_date.groupby(by='City').count()['Brand']
grouped=df['Brand'].groupby(by=[df['Country'],df['State/Province']]).count()
print(grouped)
print(type(grouped))
7:索引和复合索引
#把某一列作为索引df.set_index
#重置索引 df.index=['x','y']
df1=pd.DataFrame(np.ones(8).reshape(2,4))
df1.index=['a','b']
# df1.reindex['a','f']
# print(df1)
df1.columns=['c','d','e','f']
#print(df1)
df2=df1.set_index('c')
print(df2)
df2=df1.set_index('c',drop=False)
#c不止是索引,仍然是列
print(df2)
#index.unique
df2=df1.set_index('c',drop=False).index.unique()
print(df2)
#index是可迭代的对象,可以len( ),也可以list()
df2=len(df1.set_index('c',drop=False))
#c不止是索引,仍然是列
print(df2)
df2=list(df1.set_index('c',drop=False))
print(df2)
#设置2个列作为索引
#设置两个列作为索引
df3=df1.set_index(['c','d'],drop='false')
print(df3)
#简单的索引操作