目录
摘 要:
1.问题背景与问题重述
1.1 问题背景
1.2 问题重述
2.模型假设
3.符号说明
4.问题一的求解
4.1 问题分析
4.2 异常数据的处理
4.2.1 明显错误数据的处理
4.2.2 加减速异常数据的处理
4.3 缺失数据的处理
4.3.1 数据插补处理
4.3.2 视为长期停车处理
4.3.3 删除记录处理
4.4 其他不良数据的处理
4.5 数据预处理结果
5.问题二的求解
5.1 问题分析
5.2 运动学片段的划分
5.3 特征参数的选择与计算
5.4 运动学片段提取
6.问题三的求解
6.1 问题分析
6.2 主成分分析
6.2.1 PCA 基本原理
6.2.2 PCA 编码实现
6.3 K-means 聚类分析
6.3.1 聚类原理
6.3.2 聚类结果与分析
6.4 行驶工况构建
6.4.1 各类运动学片段时间占比计算
6.4.2 各类运动学片段选取
6.5 汽车运动特征评估
6.5.1 基于运动学片段特征参数的误差分析
6.5.2 基于油耗特征的行驶工况评价
7.模型的评价与改进
7.1 模型的优点
7.2 模型的缺点
参考文献
代码实现
问题一数据预处理 Python 代码
问题二运动学片段提取 Python 代码
问题三行驶工况构建 Python 代码
摘 要:
1.问题背景与问题重述
1.1 问题背景
1.2 问题重述
2.模型假设
3.符号说明
4.问题一的求解
4.1 问题分析
4.2 异常数据的处理
4.2.1 明显错误数据的处理
上述的异常数据在文件 1 中有 3 组,如下表所示,文件 2 中有 300 组,文件 3 中没有。
4.2.2 加减速异常数据的处理
4.3 缺失数据的处理
4.3.1 数据插补处理
4.3.2 视为长期停车处理
4.3.3 删除记录处理
4.4 其他不良数据的处理
4.5 数据预处理结果
5.问题二的求解
5.1 问题分析
5.2 运动学片段的划分
5.3 特征参数的选择与计算
5.4 运动学片段提取
6.问题三的求解
6.1 问题分析
6.2 主成分分析
6.2.1 PCA 基本原理
6.2.2 PCA 编码实现
6.3 K-means 聚类分析
6.3.1 聚类原理
6.3.2 聚类结果与分析
6.4 行驶工况构建
6.4.1 各类运动学片段时间占比计算
6.4.2 各类运动学片段选取
6.5 汽车运动特征评估
6.5.1 基于运动学片段特征参数的误差分析
6.5.2 基于油耗特征的行驶工况评价
7.模型的评价与改进
7.1 模型的优点
7.2 模型的缺点
参考文献
代码实现
问题一数据预处理 Python 代码
import pandas as pd
import numpy as np
import datetime
import tqdm
def segments(rawData):
    """Find the longest stop-to-stop segment of a speed trace.

    Column 1 of ``rawData`` is read positionally and compared with zero.
    A segment is closed on every row where that value drops from non-zero
    to zero, and the next segment starts on the following row.  When the
    value becomes non-zero more than 180 rows after the current segment
    start, the start is moved so that exactly 180 rows precede that row.

    Args:
        rawData: DataFrame whose column 1 holds the speed values.

    Returns:
        tuple: ``(maxl, maxb)`` - row count of the longest closed segment
        and the position of its first row; ``(0, 0)`` if none was closed.
    """
    seg_start = 0
    longest_len = 0
    longest_start = 0
    pos = 1
    # The original loop bound is kept: the last two rows are never examined.
    while pos < len(rawData) - 2:
        was_stopped = rawData.iat[pos - 1, 1] == 0
        is_moving = rawData.iat[pos, 1] != 0
        if was_stopped and is_moving and pos - seg_start > 180:
            # Long lead-in before moving off: keep only 180 rows of it.
            seg_start = pos - 180
        elif not (was_stopped or is_moving):
            # moving -> stopped: the current segment ends here.
            length = pos - seg_start
            if length > longest_len:
                longest_len = length
                longest_start = seg_start
            seg_start = pos + 1
        pos += 1
    return longest_len, longest_start
def checkMissing(rawData):
    """Return every time gap longer than 3 seconds between consecutive rows.

    Args:
        rawData: DataFrame whose column 0 holds datetime-like values
            (subtracting two of them must give an object with
            ``total_seconds()``).

    Returns:
        list: gap lengths in seconds, in row order, for each pair of
        neighbouring rows that is more than 3 seconds apart.
    """
    stamps = [rawData.iat[i, 0] for i in range(len(rawData))]
    gaps = ((later - earlier).total_seconds()
            for earlier, later in zip(stamps, stamps[1:]))
    return [gap for gap in gaps if gap > 3]
def checkSlow(rawData):
    """Return length and start of the longest stop-to-stop segment.

    The algorithm is the same as in ``segments``: column 1 is read
    positionally as the speed, a segment ends on each row where the speed
    falls from non-zero to zero, and a segment start is moved to 180 rows
    before the row where the speed becomes non-zero whenever that row lies
    more than 180 rows after the start.

    Args:
        rawData: DataFrame whose column 1 holds the speed values.

    Returns:
        tuple: ``(maxl, maxb)`` - number of rows in the longest closed
        segment and the position of its first row (``(0, 0)`` if none).
    """
    best = (0, 0)  # (length, start position)
    start = 0
    # The original bound is preserved; the final two rows are not scanned.
    for n in range(1, max(len(rawData) - 2, 1)):
        previous = rawData.iat[n - 1, 1]
        current = rawData.iat[n, 1]
        if previous == 0 and current != 0 and n - start > 180:
            start = n - 180
        elif previous == 0 or current != 0:
            continue
        else:
            # Speed just dropped to zero: close the segment [start, n).
            if n - start > best[0]:
                best = (n - start, start)
            start = n + 1
    return best
def process(rawData):
    """Add a speed-in-m/s column and a per-row acceleration column, then export.

    The frame is modified in place and finally written to a hard-coded
    Excel path.

    * ``'GPS (m/s)'`` is column 2 divided by 3.6.
    * ``' (m/s^2)'`` is the difference of column 3 between a row and its
      predecessor divided by the difference of column 0; the first row
      gets no value.

    NOTE(review): column 3 is read positionally; this is the new
    ``'GPS (m/s)'`` column only when the input has exactly three columns -
    confirm against the real sheet layout.  Rows are written with
    ``.loc[i, ...]``, so the frame is assumed to carry the default
    0..n-1 index.

    Args:
        rawData: DataFrame with numeric time in column 0 and speed in
            column 2.
    """
    print(list(rawData))
    row_count = len(rawData)
    for i in range(row_count):
        rawData.loc[i, 'GPS (m/s)'] = rawData.iat[i, 2] / 3.6
    for i in range(1, row_count):
        timediff = rawData.iat[i, 0] - rawData.iat[i - 1, 0]
        speeddiff = rawData.iat[i, 3] - rawData.iat[i - 1, 3]
        rawData.loc[i, ' (m/s^2)'] = speeddiff / timediff
    print(rawData.head(10))
    rawData.to_excel('/Users/yangkai/Downloads/questionD/ 2 .xlsx')
def removeAbnormalAcceleration(rawData):
    """Drop, in place, every row whose column 4 value is below -8.

    The row count is printed before and after the removal, and
    ``findMissing`` is then called on the filtered frame.

    Args:
        rawData: DataFrame whose column 4 holds the value compared with
            -8 (presumably an acceleration - confirm with the sheet
            layout).
    """
    print(' ',rawData.shape[0])
    # Collect the offending positions first and drop them in one call
    # instead of re-indexing the frame after every single removal.
    too_low = [pos for pos in range(len(rawData)) if rawData.iat[pos, 4] < -8]
    rawData.drop(rawData.index[too_low], inplace=True)
    print(' ', rawData.shape[0])
    findMissing(rawData)
def findStartupException(rawData):
    """Locate starts where column 2 exceeds 100 within 7 time units.

    A start is a row whose column 2 value is non-zero while the previous
    row's is zero.  From there the rows are scanned while the value stays
    at or below 100 and the column 0 difference to the last zero row stays
    at or below 7.  If the scan stops on a row above 100 that is still
    inside the 7-unit window, the position of the last zero row is
    reported.

    Two defects of the original are fixed: the scan no longer reads past
    the last row, and the outer loop always advances (previously a start
    whose first row was already above 100, or more than 7 units after the
    zero row, repeated forever).

    Args:
        rawData: DataFrame with numeric time in column 0 and speed in
            column 2.

    Returns:
        list: positions of the zero rows that precede an abnormal start.
    """
    res = []
    total = len(rawData)
    i = 1
    while i < total:
        if rawData.iat[i - 1, 2] == 0 and rawData.iat[i, 2] != 0:
            j = i - 1
            start_time = rawData.iat[j, 0]
            while (i < total and rawData.iat[i, 2] <= 100
                   and rawData.iat[i, 0] - start_time <= 7):
                i += 1
            if (i < total and rawData.iat[i, 2] > 100
                    and rawData.iat[i, 0] - start_time <= 7):
                res.append(j)
            # Guarantee progress even when the scan above did not move.
            i = max(i, j + 2)
        else:
            i += 1
    return res
def findMissing(rawData):
    """Print and return every gap longer than one second between rows.

    Args:
        rawData: DataFrame whose column 1 holds datetime-like values.

    Returns:
        list: ``(timestamp_before_gap, gap_seconds)`` tuples in row order.
        The original only printed these values; returning them as well
        keeps existing callers working.
    """
    gaps = []
    for i in range(1, len(rawData)):
        before = rawData.iat[i - 1, 1]
        diff = (rawData.iat[i, 1] - before).total_seconds()
        if 1 < diff:
            print(before, diff)
            gaps.append((before, diff))
    return gaps
def findDuplicate(rawData):
    """Find runs of rows whose position does not change while moving.

    Row ``i`` and row ``i + 1`` count as duplicates when they are exactly
    one second apart (column 1), column 2 of row ``i`` is above 10, and
    columns 8 and 9 are equal in both rows.  Each maximal run is reported
    as ``[first_position, number_of_rows]``; every run is printed.

    The original conditions continued across lines without parentheses
    and its inner loop could read past the last row; both are fixed here.

    Args:
        rawData: DataFrame with datetime-like values in column 1, speed in
            column 2 and two position columns at 8 and 9.

    Returns:
        list: ``[start, length]`` pairs (the original returned nothing).
    """
    last = len(rawData) - 1

    def repeats_next(k):
        # True when row k + 1 repeats the position of row k one second later.
        return ((rawData.iat[k + 1, 1] - rawData.iat[k, 1]).total_seconds() == 1
                and rawData.iat[k, 2] > 10
                and rawData.iat[k + 1, 8] == rawData.iat[k, 8]
                and rawData.iat[k + 1, 9] == rawData.iat[k, 9])

    res = []
    i = 0
    while i < last:
        if repeats_next(i):
            j = i
            while i < last and repeats_next(i):
                i += 1
            res.append([j, i + 1 - j])
        else:
            i += 1
    for r in res:
        print(r[0], r[1])
    return res
def removeError(rawData):
    """Remove three kinds of bad rows in place, then export the frame.

    The stages run in this order, and the row count is printed before the
    first stage and after each one:

    1. A row is dropped when column 3 is above 10 and columns 9 and 10
       equal those of the row now preceding it (rows already dropped are
       skipped, so the comparison is always with the last kept row).
    2. Rows whose column 5 is above 4 or below -8 are dropped.
    3. Rows where column 9 or column 10 is 0, or column 3 is above 120,
       are dropped.

    The result is written to a hard-coded Excel path.  The original
    first-stage condition continued across lines without parentheses.

    Args:
        rawData: DataFrame with speed in column 3, a value compared with
            4 / -8 in column 5 and position values in columns 9 and 10.
    """
    print(' ',rawData.shape[0])
    # Stage 1 depends on the previously kept row, so it stays sequential.
    i = 1
    while i < len(rawData):
        if (rawData.iat[i, 3] > 10
                and rawData.iat[i - 1, 9] == rawData.iat[i, 9]
                and rawData.iat[i - 1, 10] == rawData.iat[i, 10]):
            rawData.drop(rawData.index[i], inplace=True)
        else:
            i += 1
    print(' ', rawData.shape[0])

    # Stages 2 and 3 are plain per-row filters: collect, then drop once.
    out_of_range = [p for p in range(len(rawData))
                    if rawData.iat[p, 5] > 4 or rawData.iat[p, 5] < -8]
    rawData.drop(rawData.index[out_of_range], inplace=True)
    print(' ', rawData.shape[0])

    invalid = [p for p in range(len(rawData))
               if rawData.iat[p, 9] == 0 or rawData.iat[p, 10] == 0
               or rawData.iat[p, 3] > 120]
    rawData.drop(rawData.index[invalid], inplace=True)
    print(' ', rawData.shape[0])
    rawData.to_excel('/Users/yangkai/Downloads/questionD/ 3 .xlsx')
def Interpolation(rawData):
    """Fill time gaps of 2 or 3 units by linear interpolation.

    Column 1 is the time.  Wherever two neighbouring rows are more than 1
    and at most 3 apart, one row (difference exactly 2) or two rows (any
    other difference in that range) are inserted.  An inserted row holds
    ``0`` in columns 0 and 2, the stepped time in column 1 and linearly
    interpolated values in columns 3..17.  Column 5 of the inserted rows
    and of the row closing the gap is then recomputed as the difference
    of column 4 to the preceding row.

    The original built the result and discarded it (its export lines were
    commented out); the frame is now returned.  ``rawData`` itself is not
    modified.

    Args:
        rawData: DataFrame with exactly 18 columns, numeric time in
            column 1.

    Returns:
        pandas.DataFrame: the rows with gaps filled, same column labels.
    """
    col = list(rawData)
    print(col)
    data = list(np.array(rawData))
    print(len(rawData))
    i = 1
    while i < len(data):
        prev, cur = data[i - 1], data[i]
        gap = cur[1] - prev[1]
        if not 1 < gap <= 3:
            i += 1
            continue
        missing = 1 if gap == 2 else 2
        if missing == 2:
            print(cur[1])
        parts = missing + 1
        filled = []
        for step in range(1, parts):
            row = [0, prev[1] + step, 0]
            row.extend((step * cur[j] + (parts - step) * prev[j]) / parts
                       for j in range(3, 18))
            filled.append(row)
        # Column 5 is the step-to-step change of column 4 along the gap.
        chain = [prev] + filled + [cur]
        for earlier, later in zip(chain, chain[1:]):
            later[5] = later[4] - earlier[4]
        data[i:i] = filled
        i += parts
    if not data:
        return pd.DataFrame(columns=col)
    outData = pd.DataFrame(np.array(data), columns=col)
    return outData
def idleSpeed(rawData):
    """Zero out short low-speed episodes between two stops, then export.

    Column 3 is the speed.  For every run of rows that starts right after
    a zero row and ends on the last non-zero row before the next zero
    row, the run is overwritten with 0 when its maximum is at most 10.
    The frame is modified in place and written to a hard-coded Excel
    path.

    The function name in the original text carries an invisible character
    after the leading ``i`` (it looks like U+FE0F, a variation selector -
    confirm).  The definition now uses the plain ASCII name and the old
    spelling is kept as an alias.

    Args:
        rawData: DataFrame whose column 3 holds the speed values.
    """
    print(list(rawData))
    last = len(rawData) - 1
    i = 0
    while i < last:
        if not (rawData.iat[i, 3] == 0 and rawData.iat[i + 1, 3] != 0):
            i += 1
            continue
        # Row i is the last zero row; the episode starts on the next row.
        i += 1
        left = i
        maxs = rawData.iat[i, 3]
        while i < last:
            maxs = max(maxs, rawData.iat[i, 3])
            if rawData.iat[i, 3] != 0 and rawData.iat[i + 1, 3] == 0:
                if maxs <= 10:
                    for j in range(left, i + 1):
                        rawData.iat[j, 3] = 0
                i += 1
                break
            i += 1
    rawData.to_excel('/Users/yangkai/Downloads/questionD/ 1final.xlsx')


# Backward-compatible alias for the original spelling of the name.
globals()["i\ufe0fdleSpeed"] = idleSpeed
def idleSpeedprocess(rawData, max_idle=180):
    """Shorten every run of zero-speed rows to its last ``max_idle`` rows.

    Column 3 is the speed.  Rows are scanned from the end of the frame;
    within a run of consecutive zeros the ``max_idle`` rows closest to the
    end of the run are kept and all earlier rows of the run are dropped
    in place.  The last position is printed before, and the remaining row
    count after, the removal.

    The original walked down to position -1 when the frame began with a
    long zero run; pandas then read the last row of the frame and rows
    were removed from the wrong end.  Collecting the positions first
    avoids that.  The function name in the original text also carries an
    invisible character after the leading ``i`` (it looks like U+FE0F -
    confirm); the old spelling is kept as an alias.

    Args:
        rawData: DataFrame whose column 3 holds the speed values.
        max_idle: number of zero rows kept per run (180 in the original).
    """
    i = len(rawData) - 1
    print(i)
    surplus = []
    count = 0
    for pos in range(i, -1, -1):
        if rawData.iat[pos, 3] == 0:
            count += 1
            if count > max_idle:
                surplus.append(pos)
        else:
            count = 0
    rawData.drop(rawData.index[surplus], inplace=True)
    print(len(rawData))


# Backward-compatible alias for the original spelling of the name.
globals()["i\ufe0fdleSpeedprocess"] = idleSpeedprocess
# Driver for the preprocessing script: load one sheet from a hard-coded
# path and run a single processing step on it.  The other steps are kept
# below as commented-out calls and were presumably enabled one at a time
# between runs.
df = pd.read_excel('/Users/yangkai/Downloads/questionD/ 1 .xlsx')
# print(list(df))
# df[' '] = pd.to_datetime(df[' '], format='%Y/%m/%d %H:%M:%S.000.')
# begin = datetime.datetime.strptime('2017-11-01 00:00:00', '%Y-%m-%d %H:%M:%S')
# df['unix '] = df[' '].map(lambda x:(x-begin).total_seconds())
# col=list(df)
# rawData = df[col[1:-3]]
# NOTE(review): the column label below appears to have lost its text; as
# written it converts the column named ' ' to strings - confirm the name.
df[' '] = df[' '].astype('str')
# process(df)
# i=0
# while i<len(df):
# if df.iat[i, 9] == 0 or df.iat[i, 10] == 0 or df.iat[i, 3] > 120:
# df.drop(df.index[i], inplace=True)
# else:
# i += 1
# df.to_excel('/Users/yangkai/Downloads/questionD/ 3 .xlsx')
#df[' ']=pd.to_datetime(df[' '],format='%Y-%m-%d %H:%M:%S')
# df['DIFF']=df[' '].diff(1).dt.seconds
# df['DIFF'].fillna(0)
# print(df.head(10))
# print(len(df))
# print(list(df))
# i️dleSpeedprocess(df)
# i️dleSpeed(df)
# Only the gap-filling step is active in this version of the script.
Interpolation(df)
# removeError(df)
# removeAbnormalAcceleration(df)
问题二运动学片段提取 Python 代码
import numpy as np
import pandas as pd
def getTimepiece(df):
    """Split the trace into segments that end where the speed drops to zero.

    Column 3 is the speed and column 1 a numeric time.  A segment ends on
    a row whose speed is non-zero while the next row's is zero, and the
    next segment starts on that zero row.  A segment is discarded when

    * the time step across its closing non-zero -> zero transition is not 1,
    * its row span differs from its time span (a gap inside it), or
    * it spans more than 500 rows.

    The trailing, unclosed segment is always discarded.  Finally a
    segment that begins with more than 180 zero-speed rows has its start
    moved forward so that exactly 180 of them remain.

    The original tested column 1 (the time) for zero in that last step,
    so the trimming never fired; it now tests the speed column.

    Args:
        df: DataFrame with numeric time in column 1 and speed in column 3.

    Returns:
        list: ``[first_position, last_position]`` pairs, both inclusive.
    """
    print(list(df))
    seg = [[0]]
    for i in range(len(df) - 1):
        if df.iat[i, 3] != 0 and df.iat[i + 1, 3] == 0:
            if df.iat[i + 1, 1] - df.iat[i, 1] == 1:
                seg[-1].append(i)
            else:
                # Time jumps right at the stop: the open segment is unusable.
                seg.pop(-1)
            seg.append([i + 1])
    seg.pop(-1)  # the last segment never reached a closing stop

    kept = []
    for first, last in seg:
        span = last - first
        if span != df.iat[last, 1] - df.iat[first, 1]:
            continue
        if span > 500 or span < 0:
            continue
        kept.append([first, last])

    for piece in kept:
        idle = 0
        for j in range(piece[0], piece[1] + 1):
            if df.iat[j, 3] == 0:
                idle += 1
            else:
                break
        if idle > 180:
            piece[0] += idle - 180
    return kept
def extract(begin, end, data):
    """Compute the 41 feature values of one segment.

    ``data`` is indexed as ``data[row][column]``.  Column 1 is copied as
    the start/end stamp, column 4 is the speed, column 5 the
    acceleration, and columns 6, 7, 8 and 11..17 are ten further signals
    for which a maximum and a mean are reported.

    Per row: an acceleration above 0.1 counts towards ``Ta`` (and the
    acceleration sum) only if a neighbouring row inside the segment is
    also above 0.1; likewise below -0.1 for ``Td``; any value within
    [-0.1, 0.1] counts towards ``Tc``.  ``Ti`` counts rows with zero
    speed.

    NOTE(review): the original indentation was lost.  This version
    assumes the acceleration sums are accumulated together with
    ``Ta``/``Td`` and that ``Ti`` is counted for every row, not only for
    cruise rows - confirm against the original script.
    NOTE(review): every maximum starts from 0 (and ``amin`` from 0), so a
    signal that is negative throughout reports a maximum of 0; kept as in
    the original.

    Args:
        begin: position of the first row of the segment (inclusive).
        end: position of the last row of the segment (inclusive).
        data: sequence of rows with at least 18 columns.

    Returns:
        list: ``[start, end, T, Ta, Td, Tc, Ti, S, vmax, amax, amin,
        mean speed, mean speed excluding idle rows, mean acceleration,
        mean deceleration, speed std, acceleration RMS (divisor T-2),
        Ta/T, Td/T, Tc/T, Ti/T]`` followed by ``max, mean`` for each of
        the ten signal columns.

    Raises:
        ZeroDivisionError: if the segment has no acceleration rows, no
            deceleration rows, or no row with non-zero speed.  The
            original failed in these cases only for some operand types;
            raising explicitly makes that uniform.
    """
    print(begin)
    rows = [data[i] for i in range(begin, end + 1)]
    T = len(rows)
    speeds = [row[4] for row in rows]
    accels = [row[5] for row in rows]

    Ta = Td = Tc = 0
    acc_sum = dec_sum = 0
    for k, a in enumerate(accels):
        # A missing neighbour is treated as 0, which satisfies neither test.
        before = accels[k - 1] if k > 0 else 0
        after = accels[k + 1] if k < T - 1 else 0
        if a > 0.1:
            if before > 0.1 or after > 0.1:
                Ta += 1
                acc_sum += a
        elif a < -0.1:
            if before < -0.1 or after < -0.1:
                Td += 1
                dec_sum += a
        else:
            Tc += 1
    Ti = sum(1 for v in speeds if v == 0)

    if Ta == 0 or Td == 0 or T == Ti:
        raise ZeroDivisionError('segment lacks acceleration, deceleration or motion')

    S = sum(speeds)
    pjsd = S / T
    sdstd = (sum((v - pjsd) ** 2 for v in speeds) / (end - begin)) ** 0.5
    jsstd = (sum(a ** 2 for a in accels) / (end - begin - 1)) ** 0.5

    res = [rows[0][1], rows[-1][1], T, Ta, Td, Tc, Ti, S,
           max(0, max(speeds)), max(0, max(accels)), min(0, min(accels)),
           pjsd, S / (T - Ti), acc_sum / Ta, dec_sum / Td, sdstd, jsstd,
           Ta / T, Td / T, Tc / T, Ti / T]
    for column in (6, 7, 8, 11, 12, 13, 14, 15, 16, 17):
        values = [row[column] for row in rows]
        res.append(max(0, max(values)))
        res.append(sum(values) / T)
    print(res)
    return res
def getFeature(df):
    """Extract the features of every segment and write them to Excel.

    Segments come from ``getTimepiece``; each is passed to ``extract``
    together with the frame converted to a list of row arrays.  The
    feature table is written to a hard-coded path and also returned.

    Changes from the original:

    * The 41 column labels had lost their text (most were ``' '`` and one
      literal was split across two lines, which is a syntax error).  They
      are replaced by the names of the values ``extract`` returns, in the
      same order, so every label is unique.
    * The bare ``except: pass`` is narrowed to ``ZeroDivisionError`` (the
      failure ``extract`` produces for a segment without acceleration or
      deceleration rows) and the number of skipped segments is printed.

    Args:
        df: preprocessed DataFrame accepted by ``getTimepiece`` and
            ``extract``.

    Returns:
        pandas.DataFrame: one row per kept segment.
    """
    seg = getTimepiece(df)
    source = list(np.array(df))
    feature = []
    skipped = 0
    for begin, end in seg:
        try:
            feature.append(extract(begin, end, source))
        except ZeroDivisionError:
            skipped += 1
    print('skipped segments:', skipped)
    col = ['begin', 'end', 'T', 'Ta', 'Td', 'Tc', 'Ti', 'S',
           'vmax', 'amax', 'amin', 'pjsd', 'pjxssd', 'pjjias',
           'pjjians', 'sdstd', 'jsstd', 'Ta/T', 'Td/T', 'Tc/T',
           'Ti/T', 'xmax', 'xjz', 'ymax', 'yjz',
           'zmax', 'zjz', 'zsmax', 'zsjz', 'njmax',
           'njjz', 'yhmax', 'yhjz', 'tbmax', 'tbjz',
           'rbmax', 'rbjz', 'fhmax', 'fhjz', 'jqmax',
           'jqjz']
    outData = pd.DataFrame(feature, columns=col)
    outData.to_excel('/Users/yangkai/Downloads/questionD/ 3 .xlsx')
    return outData
# Driver for the feature-extraction script: load the preprocessed sheet
# from a hard-coded path and build the per-segment feature table.
df = pd.read_excel('/Users/yangkai/Downloads/questionD/ 3final.xlsx')
getFeature(df)
问题三行驶工况构建 Python 代码
import numpy as np
import pandas as pd
def extract(begin, end, data):
    """Compute the 41 feature values of one segment.

    ``data`` is indexed as ``data[row][column]``.  Column 1 is copied as
    the start/end stamp, column 4 is the speed, column 5 the
    acceleration, and columns 6, 7, 8 and 11..17 are ten further signals
    for which a maximum and a mean are reported.

    Per row: an acceleration above 0.1 counts towards ``Ta`` (and the
    acceleration sum) only if a neighbouring row inside the segment is
    also above 0.1; likewise below -0.1 for ``Td``; any value within
    [-0.1, 0.1] counts towards ``Tc``.  ``Ti`` counts rows with zero
    speed.

    The original initialised ``jsstd`` with ``0,0`` (a tuple), so the
    first ``jsstd += ...`` raised ``TypeError``; it is a plain number now.

    NOTE(review): the original indentation was lost.  This version
    assumes the acceleration sums are accumulated together with
    ``Ta``/``Td`` and that ``Ti`` is counted for every row, not only for
    cruise rows - confirm against the original script.
    NOTE(review): every maximum starts from 0 (and ``amin`` from 0), so a
    signal that is negative throughout reports a maximum of 0; kept as in
    the original.

    Args:
        begin: position of the first row of the segment (inclusive).
        end: position of the last row of the segment (inclusive).
        data: sequence of rows with at least 18 columns.

    Returns:
        list: ``[start, end, T, Ta, Td, Tc, Ti, S, vmax, amax, amin,
        mean speed, mean speed excluding idle rows, mean acceleration,
        mean deceleration, speed std, acceleration RMS (divisor T-2),
        Ta/T, Td/T, Tc/T, Ti/T]`` followed by ``max, mean`` for each of
        the ten signal columns.

    Raises:
        ZeroDivisionError: if the segment has no acceleration rows, no
            deceleration rows, or no row with non-zero speed.
    """
    print(begin)
    segment = [data[i] for i in range(begin, end + 1)]
    T = len(segment)
    v = [row[4] for row in segment]
    acc = [row[5] for row in segment]

    Ta = Td = Tc = 0
    jiasusum = jiansusum = 0
    for k, a in enumerate(acc):
        # A missing neighbour is treated as 0, which satisfies neither test.
        left = acc[k - 1] if k > 0 else 0
        right = acc[k + 1] if k < T - 1 else 0
        if a > 0.1:
            if left > 0.1 or right > 0.1:
                Ta += 1
                jiasusum += a
        elif a < -0.1:
            if left < -0.1 or right < -0.1:
                Td += 1
                jiansusum += a
        else:
            Tc += 1
    Ti = sum(1 for value in v if value == 0)

    if Ta == 0 or Td == 0 or T == Ti:
        raise ZeroDivisionError('segment lacks acceleration, deceleration or motion')

    S = sum(v)
    pjsd = S / T
    sdstd = (sum((value - pjsd) ** 2 for value in v) / (end - begin)) ** 0.5
    jsstd = (sum(a ** 2 for a in acc) / (end - begin - 1)) ** 0.5

    res = [segment[0][1], segment[-1][1], T, Ta, Td, Tc, Ti, S,
           max(0, max(v)), max(0, max(acc)), min(0, min(acc)),
           pjsd, S / (T - Ti), jiasusum / Ta, jiansusum / Td, sdstd, jsstd,
           Ta / T, Td / T, Tc / T, Ti / T]
    for column in (6, 7, 8, 11, 12, 13, 14, 15, 16, 17):
        signal = [row[column] for row in segment]
        res.extend([max(0, max(signal)), sum(signal) / T])
    print(res)
    return res
def evaluate(df):
    """Aggregate per-segment features into whole-cycle figures.

    Each row of ``df`` is one segment.  Columns are read by position:
    3 = T, 4 = Ta, 5 = Td, 6 = Tc, 7 = Ti, 8 = S, 14 / 15 = mean
    acceleration / deceleration, 16 / 17 = speed / acceleration spread,
    and the even columns 22..40 = ten signal values that are averaged
    with T as weight.

    The twenty results are printed in the original order with the
    original labels (their text was lost, so most are a single blank) and
    are also returned, which the original did not do.

    NOTE(review): in the list ``extract`` returns, each signal's mean
    directly follows its maximum; if this sheet is that list written with
    one leading index column, the even columns 22..40 hold the maxima
    rather than the means - confirm the sheet layout.

    Args:
        df: DataFrame with at least 41 columns, one row per segment.

    Returns:
        dict: the aggregated figures keyed by an English name;
        ``'signal_means'`` is a list of the ten weighted signal means.

    Raises:
        ZeroDivisionError: for an empty frame (all totals stay 0), as in
            the original.
    """
    T = S = Ta = Td = Tc = Ti = 0
    acc_weighted = dec_weighted = 0
    speed_sq = acc_sq = 0
    signal_columns = list(range(22, 41, 2))
    signal_sums = [0] * len(signal_columns)
    for i in range(len(df)):
        duration = df.iat[i, 3]
        T += duration
        Ta += df.iat[i, 4]
        Td += df.iat[i, 5]
        Tc += df.iat[i, 6]
        Ti += df.iat[i, 7]
        S += df.iat[i, 8]
        # Means are re-weighted by the number of rows they were taken over.
        acc_weighted += df.iat[i, 14] * df.iat[i, 4]
        dec_weighted += df.iat[i, 15] * df.iat[i, 5]
        # Spreads are pooled using the divisors extract applied (T-1, T-2).
        speed_sq += df.iat[i, 16] ** 2 * (duration - 1)
        acc_sq += df.iat[i, 17] ** 2 * (duration - 2)
        for k, column in enumerate(signal_columns):
            signal_sums[k] += df.iat[i, column] * duration

    result = {
        'mean_speed': S / T,
        'mean_running_speed': S / (T - Ti),
        'mean_acceleration': acc_weighted / Ta,
        'mean_deceleration': dec_weighted / Td,
        'speed_std': (speed_sq / (T - 1)) ** 0.5,
        'acceleration_std': (acc_sq / (T - 2)) ** 0.5,
        'Ta_ratio': Ta / T,
        'Td_ratio': Td / T,
        'Tc_ratio': Tc / T,
        'Ti_ratio': Ti / T,
        'signal_means': [total / T for total in signal_sums],
    }
    print(' ',result['mean_speed'])
    print(' ',result['mean_running_speed'])
    print(' ',result['mean_acceleration'])
    print(' ',result['mean_deceleration'])
    print(' ',result['speed_std'])
    print(' ', result['acceleration_std'])
    print(' ',result['Ta_ratio'])
    print(' ',result['Td_ratio'])
    print(' ',result['Tc_ratio'])
    print(' ',result['Ti_ratio'])
    labels = ['X ', 'Y ', 'Z ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
    for label, mean in zip(labels, result['signal_means']):
        print(label, mean)
    return result
# Driver for the evaluation script: load a per-segment feature sheet from
# a hard-coded path and print its aggregated figures.
df = pd.read_excel('/Users/yangkai/Downloads/questionD/ - 123 .xlsx')
evaluate(df)