批量提取SemEval 2014 Task 4-aspect_term的xml文件为csv
数据data
格式
<sentence id="892:1">
<text>Boot time is super fast, around anywhere from 35 seconds to 1 minute.</text>
<aspectTerms>
<aspectTerm term="Boot time" polarity="positive" from="0" to="9"/>
</aspectTerms>
</sentence>
目的
把XML数据转换为csv文件
代码
import xml.etree.cElementTree as ET
import pandas as pd
def xml_csv(listlist):
xml = ['Laptop_Train.xml','Laptops_Test.xml','laptops-trial.xml',
'Restaurants_Test.xml','Restaurants_Train.xml','Restaurants-trial.xml']
csv_name = ['Laptop_Train.csv','Laptops_Test.csv','laptops-trial.csv',
'Restaurants_Test.csv','Restaurants_Train.csv','Restaurants-trial.csv']
# 解析XML文件
tree = ET.parse(xml[listlist])
root = tree.getroot()
# 提取所有sentence元素
sentences = root.findall('sentence')
# 修复提取数据的方法,处理没有<aspectTerms>子元素的情况
data=[]
# 遍历每个sentence元素
for sentence in sentences:
# 提取text内容
text = sentence.find('text').text
# 检查是否存在<aspectTerms>子元素
aspect_terms_element = sentence.find('aspectTerms')
if aspect_terms_element is not None:
# 提取aspectTerms中的所有aspectTerm元素
aspect_terms = aspect_terms_element.findall('aspectTerm')
# 提取每个aspectTerm的term和polarity
for aspect_term in aspect_terms:
term = aspect_term.get('term')
polarity = aspect_term.get('polarity')
data.append([text,term,polarity])
df = pd.DataFrame(data,columns=['text', 'term', 'polarity'])
df = df[df['polarity'].isin(['positive', 'negative', 'neutral'])]
df['polarity'] = df['polarity'].map(
{'positive': 1, 'neutral': 0, 'negative': -1})
df.to_csv(path_or_buf=csv_name[listlist],index=0)
# 生成csv
for i in range(6):
xml_csv(i)