参考:
对XML文件读取和编辑2-QXmlStreamReader读取 - 知乎
https://zhuanlan.zhihu.com/p/358862429
本地环境:
win10专业版,64位,Qt 5.12
代码已测试通过。
问题描述
需要按字节读取一个文档,解析其中具有xml格式的部分,并存储到一个Hash表中,方便使用。xml标签上可能带有属性信息,如下图红线所示:
解决思路
按字节读取参考:
qt5-入门-QByteArray-CSDN博客
https://blog.csdn.net/pxy7896/article/details/137583591
提取后发现文字内容大概如下:
"<Notes><UUID>71bf0eb6-0477-41e8-8520-f1f5fafac932</UUID><Type>Synthetic</Type><ConfirmedExperimentally>0</ConfirmedExperimentally><CustomMapLabel>Y14837</CustomMapLabel><UseCustomMapLabel>1</UseCustomMapLabel><Description>Cloning vector pUC57, complete sequence.</Description><Created UTC=\"1:41:49\">2020.7.2</Created><LastModified UTC=\"1:8:0\">2021.10.8</LastModified><AccessionNumber>Y14837</AccessionNumber><SequenceClass>UNA</SequenceClass><TransformedInto>unspecified</TransformedInto><References><Reference authors=\"Markausakas A, Dreguniene G.\" journal=\"Unpublished\" title=\"A new cloning vector pUC57\"/><Reference authors=\"Markauskas A.\" journal=\"Submitted (16-SEP-1997) A. Markauskas, Fermentas AB, Graiciuno 8, Vilnius 2028, LITHUANIA\" title=\"Direct Submission\"/></References><Comments><a href='http://www.informaxinc.com/'>http://www.informaxinc.com/</a><br>ORIGDB|GenBank</Comments></Notes>"
可以看到,`<Reference>` 标签不仅携带属性信息,还可能重复出现,所以应该使用 `QMultiHash`(允许同一个键对应多个值)。
总的设计思路是:当 `QXmlStreamReader::TokenType` 是 `QXmlStreamReader::StartElement` 时,读取标签名称和属性信息,存储到临时字典里;当是 `QXmlStreamReader::Characters` 时,读取标签的文本内容;当是 `QXmlStreamReader::EndElement` 时,把临时字典存储到外层字典中,并清空临时值。这样一直读取到这部分结束。
实现
/**
 * Walks an XML stream and records every element that contains no child
 * elements into `hash`.
 *
 * For each such element one entry is inserted, keyed by the element name.
 * The entry's inner hash holds the element's attributes (attribute name ->
 * attribute value) plus the element's text under the reserved key "value".
 * An attribute that is itself called "value" is therefore overwritten by the
 * element text. Elements that contain child elements are not recorded; only
 * their descendants without children are.
 *
 * @param xml  Reader positioned at the start of the data to parse; it is
 *             consumed until atEnd() reports true (end of input or error).
 * @param hash Output container. A multi-hash is used because the same element
 *             name may occur more than once.
 */
void process(QXmlStreamReader& xml, QMultiHash<QString, QHash<QString, QString>>& hash) {
    // State of the most recently opened element.
    QString name;
    QString value;
    QHash<QString, QString> attrHash;

    while (!xml.atEnd()) {
        switch (xml.readNext()) {
        case QXmlStreamReader::StartElement: {
            // A new element begins: drop whatever was collected for an
            // enclosing element so that a parent's attributes or leading text
            // cannot leak into this element's record.
            name = xml.name().toString();
            value.clear();
            attrHash.clear();

            const QXmlStreamAttributes attributes = xml.attributes();
            for (const QXmlStreamAttribute& attribute : attributes) {
                attrHash.insert(attribute.name().toString(), attribute.value().toString());
            }
            break;
        }
        case QXmlStreamReader::Characters:
            // Text of one element may be delivered as several Characters
            // tokens (for example around a CDATA section), so accumulate
            // instead of overwriting. Whitespace-only chunks are skipped.
            if (!xml.isWhitespace()) {
                value += xml.text().toString();
            }
            break;
        case QXmlStreamReader::EndElement:
            // Names match only when no child element was opened in between,
            // i.e. the element being closed is the one whose state we hold.
            if (name == xml.name().toString()) {
                attrHash.insert("value", value);
                hash.insert(name, attrHash);
            }
            // Reset the temporary state for the next element.
            name.clear();
            value.clear();
            attrHash.clear();
            break;
        default:
            // NoToken, Invalid, StartDocument, EndDocument, Comment,
            // ProcessingInstruction, ...: nothing to collect.
            break;
        }
    }

    // atEnd() also becomes true when the reader hits an error; report it
    // instead of returning a silently truncated result.
    if (xml.hasError()) {
        qWarning("process: XML parse error at line %lld, column %lld: %s",
                 static_cast<long long>(xml.lineNumber()),
                 static_cast<long long>(xml.columnNumber()),
                 qPrintable(xml.errorString()));
    }
}
使用:
// Decode the byte range [ptr, ptr + blockSize) explicitly as UTF-8 so the
// snippet also builds when implicit QByteArray -> QString casts are disabled.
// NOTE: the raw bytes may contain '\n' characters; strip them beforehand if
// they are not wanted in the parsed values (not done here).
const QString blockContent = QString::fromUtf8(byteArray.mid(ptr, blockSize));
QXmlStreamReader xml(blockContent);
QMultiHash<QString, QHash<QString, QString>> curHash;
process(xml, curHash);
// Print the result. qDebug() already terminates each message with a newline,
// so no explicit endl is appended.
for (auto it = curHash.constBegin(); it != curHash.constEnd(); ++it) {
    qDebug() << it.key() << it.value();
}
/* 解析结果
"Created" QHash(("value", "2020.7.2")("UTC", "1:41:49"))
"LastModified" QHash(("value", "2021.10.8")("UTC", "1:8:0"))
"Comments" QHash(("value", "<a href='http://www.informaxinc.com/'>http://www.informaxinc.com/</a><br>ORIGDB|GenBank"))
"Type" QHash(("value", "Synthetic"))
"Description" QHash(("value", "Cloning vector pUC57, complete sequence."))
"CustomMapLabel" QHash(("value", "Y14837"))
"UseCustomMapLabel" QHash(("value", "1"))
"ConfirmedExperimentally" QHash(("value", "0"))
"SequenceClass" QHash(("value", "UNA"))
"UUID" QHash(("value", "71bf0eb6-0477-41e8-8520-f1f5fafac932"))
"TransformedInto" QHash(("value", "unspecified"))
"Reference" QHash(("value", "")("journal", "Submitted (16-SEP-1997) A. Markauskas, Fermentas AB, Graiciuno 8, Vilnius 2028, LITHUANIA")("authors", "Markauskas A.")("title", "Direct Submission"))
"Reference" QHash(("value", "")("journal", "Unpublished")("authors", "Markausakas A, Dreguniene G.")("title", "A new cloning vector pUC57"))
"AccessionNumber" QHash(("value", "Y14837"))
*/