文章目录
- 参考文章
- 1 了解ofd文件结构
- 1.1 如何打开ofd 文件
- 1.2 ofd文件结构
- 1.3 提取信息思路
- 2. 提取发票信息实现
- 2.1 目录结构
- 2.2 实体类
- 2.3 发票解析类
- 2.4 controller
- 2.5 service
参考文章
ofd发票解析
什么是ofd格式
-
OFD(Open Fixed-layout Document)是我国的版式文档国家标准(GB/T 33190-2016),电子发票等文件常采用这种格式。OFD 文件本质上是一个 ZIP 压缩包,其中的内容以 XML 格式进行存储,因此我们可以使用 Java 中的 XML 解析器来解析 OFD 文件并提取其中的数据。
-
在 java 中,我们可以使用DOM解析器来解析XML文件。然后编写代码来读取OFD 文件并解析其中的数据。
1 了解ofd文件结构
1.1 如何打开ofd 文件
- 可以把ofd文件后缀改为zip,再进行解压就可以看到文件结构了
1.2 ofd文件结构
- 解压缩 ofd 文件,正常情况下在 Doc_0 目录下有 Annots、Pages、Res、Tags、Tpls 文件夹及 Document.xml、DocumentRes.xml、PublicRes.xml 等文件;
- Pages/Page_0/Content.xml:存放 value 信息(每一条 value 的内容、位置、字体、字号、ID 号),
- Tags/CustomTag.xml:存放“key”信息(定义的英文key及ID号,注意:个别字段有可能无“key”定义),
一般情况下根据“key”信息与value信息一一映射,可恢复票面上所有字段
以下信息有所删减
Tags/CustomTag.xml:存放“key”信息【例如InvoiceNo、IssueDate、Note】
This XML file does not appear to have any style information associated with it. The document tree is shown below.
<ofd:root xmlns:ofd="http://www.ofdspec.org/2016" version="1.0">
<ofd:InvoiceNo>
<ofd:ObjectRef PageRef="61">6922</ofd:ObjectRef>
</ofd:InvoiceNo>
<ofd:IssueDate>
<ofd:ObjectRef PageRef="61">6923</ofd:ObjectRef>
</ofd:IssueDate>
<ofd:TaxInclusiveTotalAmount>
<ofd:ObjectRef PageRef="61">6935</ofd:ObjectRef>
<ofd:ObjectRef PageRef="61">6936</ofd:ObjectRef>
</ofd:TaxInclusiveTotalAmount>
<ofd:Note>
<ofd:ObjectRef PageRef="61">6944</ofd:ObjectRef>
<ofd:ObjectRef PageRef="61">6945</ofd:ObjectRef>
<ofd:ObjectRef PageRef="61">6946</ofd:ObjectRef>
<ofd:ObjectRef PageRef="61">6947</ofd:ObjectRef>
</ofd:Note>
</ofd:root>
Pages/Page_0/Content.xml :
This XML file does not appear to have any style information associated with it. The document tree is shown below.
<ofd:Page xmlns:ofd="http://www.ofdspec.org/2016">
<ofd:Area>
<ofd:PhysicalBox>0 0 210 140</ofd:PhysicalBox>
</ofd:Area>
<ofd:Template TemplateID="1" ZOrder="Background"/>
<ofd:Content>
<ofd:Layer ID="6948">
<ofd:TextObject ID="6922" Boundary="170 10.3 38 5" Font="6919" Size="3.175">
<ofd:TextCode X="0" Y="3.6414" DeltaX="g 19 1.5875">发票编号</ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6923" Boundary="170 16.4 38 5" Font="6919" Size="3.175">
<ofd:TextCode X="0" Y="3.6414" DeltaX="g 4 1.5875 3.175 g 2 1.5875 3.175 g 2 1.5875">发票日期</ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6934" Boundary="62.9471 96 82.5189 7.4102" Font="6919" Size="3.175">
<ofd:TextCode X="0" Y="4.8465" DeltaX="g 5 3.175">发票金额中文</ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6935" Boundary="155.5164 96.0681 48.665 7.6749" Font="6925" Size="3.8806">
<ofd:TextCode X="0" Y="4.9221">¥</ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6936" Boundary="155.5164 96.0681 48.665 7.6749" Font="6919" Size="3.8806">
<ofd:TextCode X="2.3284" Y="4.9221" DeltaX="g 6 1.9403">发票金额阿拉伯数字</ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6944" Boundary="11.1083 104.5369 193 3.3" Font="6919" Size="3.175">
<ofd:TextCode X="0" Y="2.7273" DeltaX="g 6 3.175 1.5875 g 10 3.175 g 5 1.5875 g 4 3.175 g 20 1.5875">购方开户银行:XXX支行; 银行账号:1234567890; </ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6945" Boundary="11.1083 107.8369 193 3.3" Font="6919" Size="3.175">
<ofd:TextCode X="0" Y="2.7273" DeltaX="g 6 3.175 1.5875 g 16 3.175 g 5 1.5875 g 4 3.175 g 24 1.5875">销方开户银行:XXX支行; 银行账号:1234567890; </ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6946" Boundary="11.1083 111.1369 193 3.3" Font="6919" Size="3.175">
<ofd:TextCode X="0" Y="2.7273" DeltaX="g 3 1.5875"> </ofd:TextCode>
</ofd:TextObject>
<ofd:TextObject ID="6947" Boundary="11.1083 114.4369 193 3.3" Font="6919" Size="3.175">
<ofd:TextCode X="0" Y="2.7273" DeltaX="g 3 3.175 1.5875 g 2 3.175 g 5 1.5875 g 3 3.175 1.5875 g 2 3.175 g 4 1.5875">收款人:XX; 复核人:XX; </ofd:TextCode>
</ofd:TextObject>
<ofd:ImageObject ID="6921" CTM="20 0 0 20 0 0" Boundary="6.5 6 20 20" ResourceID="6920"/>
</ofd:Layer>
</ofd:Content>
</ofd:Page>
1.3 提取信息思路
- 读取 Tags/CustomTag.xml 文件,根据“key”信息【例如 InvoiceNo、IssueDate、Note】找到对应的标签,取出其下 ObjectRef 的值(如 6922),该值即 Pages/Page_0/Content.xml 文件中对应 TextObject 的 ID
- 扫描 Pages/Page_0/Content.xml 文件,根据之前获得的 ID 的值,获得想要提取的内容
2. 提取发票信息实现
2.1 目录结构
添加Maven依赖
<dependency>
<groupId>org.dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>2.1.3</version>
</dependency>
2.2 实体类
仅有一个实体类Invoice
package com.example.ofd.entity;
import lombok.Data;
/**
 * Holder for the invoice fields extracted from an OFD file.
 * All four fields are plain Strings; accessors come from Lombok's {@code @Data}.
 */
@Data
public class Invoice {
private String invoiceNo;// invoice number
private String issueDate;// issue date
private String totalAmount;// invoiced amount
private String note;// invoice remarks
}
2.3 发票解析类
package com.example.ofd.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import com.example.ofd.entity.Invoice;
import lombok.extern.slf4j.Slf4j;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.springframework.util.StreamUtils;
import org.springframework.web.multipart.MultipartFile;
/**
* 专用于处理电子发票识别的类
*/
@Slf4j
public class OfdInvoice {
/**
* 调用该方法将前端接受到的文件暂存
*
* @param file
*/
public static Invoice parseOfdFile(MultipartFile file) {
Invoice invoice = new Invoice();
// 先判断提交上来的文件是什么类型的
String originalFilename = file.getOriginalFilename();
try {
// 创建一个临时文件
Path tempFile = null;
if (originalFilename.toLowerCase().endsWith(".ofd")) {
tempFile = Files.createTempFile("tempPrefix", ".ofd");
} else if (originalFilename.toLowerCase().endsWith(".pdf")) {
tempFile = Files.createTempFile("tempPrefix", ".pdf");
}
File tempFilePath = tempFile.toFile();
// 将MultipartFile的内容写入到临时文件
try (FileOutputStream fos = new FileOutputStream(tempFilePath)) {
fos.write(file.getBytes());
}
// 使用临时文件的路径来调用你的解析方法
invoice = extract(tempFilePath);
// 删除临时文件,或者在某些情况下保留它
tempFilePath.delete();
} catch (Exception e) {
// 处理异常
e.printStackTrace();
}
// 返回值
return invoice;
}
/**
* 从一个ZIP 文件中提取特定格式的发票信息,并构建一个 Invoice 对象来存储这些信息
*
* @param file
* @return
* @throws IOException
* @throws DocumentException
*/
public static Invoice extract(File file) throws IOException, DocumentException {
// 打开Zip文件
ZipFile zipFile = new ZipFile(file);
// 获取Zip条目
ZipEntry entry = zipFile.getEntry("Doc_0/Tags/CustomTag.xml"); // 标签文件,在本文件中根据key信息,获取到标签的ID
ZipEntry entry1 = zipFile.getEntry("Doc_0/Pages/Page_0/Content.xml"); // 本文件中存放的是value信息,根据上个id来查找各自的value
// 读取XML文件内容
InputStream input = zipFile.getInputStream(entry);
InputStream input1 = zipFile.getInputStream(entry1);
String body = StreamUtils.copyToString(input, Charset.forName("utf-8")); // 读取xml文件的内容,同时指定字符集为UTF-8
String content = StreamUtils.copyToString(input1, Charset.forName("utf-8"));
// 关闭ZIP文件
zipFile.close();
Map<String, List<String>> map = new HashMap<>();
// 解析 CustomTag.xml 文件
Document document = DocumentHelper.parseText(body); // 解析 CustomTag.xml 的内容,并获取根元素
Element root = document.getRootElement(); // 获取根元素
// 发票编号
Element invoiceNo = root.element("InvoiceNo");// 获取InvoiceNo元素
if (invoiceNo != null) {
Element objectRef = invoiceNo.element("ObjectRef");// 获取其下子标签
if (objectRef != null) {
String invoiceNumber = objectRef.getTextTrim();// 访问ObjectRef的文本内容
List<String> tmp = new ArrayList<>();// 将文本内容添加到Map中
tmp.add(invoiceNumber);
map.put("InvoiceNumber", tmp);
}
}
// 开票日期
Element issueDate = root.element("IssueDate");// 获取InvoiceNo元素
if (issueDate != null) {
Element objectRef = issueDate.element("ObjectRef");// 获取其下子标签
if (objectRef != null) {
String invoiceDate = objectRef.getTextTrim();// 访问ObjectRef的文本内容
// 将文本内容添加到Map中
List<String> tmp = new ArrayList<>();
tmp.add(invoiceDate);
map.put("invoiceDate", tmp);
}
}
// 开票金额【其下有两个子元素,第二个是想要的标签】
Element totalAmount = root.element("TaxInclusiveTotalAmount");// 获取InvoiceNo元素
if (totalAmount != null) {
// 遍历InvoiceNo下的所有子元素
for (Iterator<Element> it = totalAmount.elementIterator(); it.hasNext(); ) {
Element element = it.next();
if (it.hasNext() == false) {//只要最后一个标签
// 检查子元素是否是ObjectRef
if ("ObjectRef".equals(element.getName())) {
// 访问ObjectRef的文本内容
String amount = element.getTextTrim();
// 将文本内容添加到Map中
List<String> tmp = new ArrayList<>();
tmp.add(amount);
map.put("totalAmount", tmp);
}
}
}
}
// 开票备注【其下有四条信息,都需要】
Element note = root.element("Note");// 获取InvoiceNo元素
if (note != null) {
List<String> noteTmp = new ArrayList<>();
for (Iterator<Element> it = note.elementIterator(); it.hasNext(); ) {// 遍历InvoiceNo下的所有子元素
Element element = it.next();
// 检查子元素是否是ObjectRef
if ("ObjectRef".equals(element.getName())) {
// 访问ObjectRef的文本内容
String tmpNote = element.getTextTrim();
// 将文本内容添加到List数组中
noteTmp.add(tmpNote);
}
}
map.put("note", noteTmp);// 将文本内容添加到map中
}
// 根据id从content.xml中提取必要信息
Invoice invoice = new Invoice();// 先创建一个发票实例,将后续的到的值填充进去
Document contentDocument = DocumentHelper.parseText(content);
Element contentRoot = contentDocument.getRootElement();// 获取根元素
for (Map.Entry<String, List<String>> entrySet : map.entrySet()) {// 遍历map
String key = entrySet.getKey(); //获得当前key
if (key.equals("InvoiceNumber")) {// 发票号码
invoice.setInvoiceNo(getContent(contentRoot, entrySet.getValue().get(0)));
} else if (key.equals("invoiceDate")) {// 开票日期
invoice.setIssueDate(getContent(contentRoot, entrySet.getValue().get(0)));
} else if (key.equals("totalAmount")) {// 开票金额
invoice.setTotalAmount(getContent(contentRoot, entrySet.getValue().get(0)));
} else if (key.equals("note")) {// 发票备注
String detail = "";
for (int i = 0; i < entrySet.getValue().size(); i++) {
detail += getContent(contentRoot, entrySet.getValue().get(i));
}
invoice.setNote(detail);
}
}
return invoice;
}
/**
* 根据id,获取root中的文本
*
* @param root xml中的根元素
* @param id 存储有id
* @return
*/
public static String getContent(Element root, String id) {
// 遍历Content元素
Element content = root.element("Content");
if (content != null) {
// 遍历所有Layer元素
for (Element layer : content.elements("Layer")) {
// 遍历Layer下的所有TextObject元素
for (Element textObject : layer.elements("TextObject")) {
// 检查TextObject的ID
String textObjectId = textObject.attributeValue("ID");
if (id.equals(textObjectId)) {
// 找到匹配的TextObject,现在遍历其下的TextCode元素
for (Element textCode : textObject.elements("TextCode")) {
// 获取TextCode的文本内容
String text = textCode.getTextTrim();
return text;
}
}
}
}
}
return null;
}
}
2.4 controller
package com.example.ofd.controller;
import com.example.ofd.entity.Invoice;
import com.example.ofd.service.InvoiceService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
@RestController
@RequestMapping("/invoice")
public class InvoiceController {
@Autowired
InvoiceService invoiceService;
/**
* @param
*/
@CrossOrigin(origins = "http://localhost:8081", allowedHeaders = "*", allowCredentials = "true")
@PostMapping("/upload")
public ResponseEntity<Object> uploadFile(@RequestParam("file") MultipartFile file) {
try {
// 调用你的文件解析服务
Invoice parsedData = invoiceService.parseOfdFile(file);
// 返回解析后的数据
return ResponseEntity.ok(parsedData);
} catch (Exception e) {
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Error parsing file");
}
}
}
2.5 service
InvoiceServiceImpl
package com.example.ofd.service.impl;
import com.example.ofd.entity.Invoice;
import com.example.ofd.service.InvoiceService;
import com.example.ofd.utils.OfdInvoice;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
/**
 * {@link InvoiceService} implementation that forwards to the static {@link OfdInvoice} parser.
 */
@Service
public class InvoiceServiceImpl implements InvoiceService {

    /**
     * Delegates to {@link OfdInvoice#parseOfdFile(MultipartFile)}.
     *
     * @param file the uploaded invoice file
     * @return whatever the parser returns for this upload
     */
    @Override
    public Invoice parseOfdFile(MultipartFile file) {
        return OfdInvoice.parseOfdFile(file);
    }
}
InvoiceService
package com.example.ofd.service;
import com.example.ofd.entity.Invoice;
import org.springframework.web.multipart.MultipartFile;
/**
 * Service contract for turning an uploaded invoice file into an {@link Invoice}.
 */
public interface InvoiceService {
/**
 * Parses the uploaded file into an invoice.
 *
 * @param file the uploaded invoice file
 * @return the parsed invoice
 */
Invoice parseOfdFile(MultipartFile file);
}