问题
今天在 Hive 中 get_json_object
函数解析 json 串的时候,发现函数不支持解析 json 中文 key。
例如:
select get_json_object('{ "姓名":"张三" , "年龄":"18" }', '$.姓名');
我们希望的结果是得到姓名对应的值张三
,而运行之后的结果为 NULL
值。
select get_json_object('{ "abc姓名":"张三" , "abc":"18" }', '$.abc姓名');
我们希望的结果是得到姓名对应的值张三
,而运行之后的结果为 18
。
产生问题的原因
是什么原因导致的呢?我们查看 Hive 官网中 get_json_object
函数的介绍,可以发现 get_json_object
函数不能解析 json 里面中文的 key,如下图所示:
json 路径只能包含字符 [0-9a-z_]
,即不能包含 大写或特殊字符 。此外,键不能以数字开头。
那为什么 json 路径只能包含字符 [0-9a-z_]
呢?
通过查看源码我们发现 get_json_object
对应的 UDF 类的源码如下:
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.fasterxml.jackson.core.json.JsonReadFeature;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterators;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
* UDFJson.
*/
@Description(name = "get_json_object",
value = "_FUNC_(json_txt, path) - Extract a json object from path ",
extended = "Extract json object from a json string based on json path "
+ "specified, and return json string of the extracted json object. It "
+ "will return null if the input json string is invalid.\n"
+ "A limited version of JSONPath supported:\n"
+ " $ : Root object\n"
+ " . : Child operator\n"
+ " [] : Subscript operator for array\n"
+ " * : Wildcard for []\n"
+ "Syntax not supported that's worth noticing:\n"
+ " '' : Zero length string as key\n"
+ " .. : Recursive descent\n"
+ " @ : Current object/element\n"
+ " () : Script expression\n"
+ " ?() : Filter (script) expression.\n"
+ " [,] : Union operator\n"
+ " [start:end:step] : array slice operator\n")
//定义了一个名为UDFJson的类,继承自UDF类。
public class UDFGetJsonObjectCN extends UDF {
//定义一个静态正则表达式模式,用于匹配JSON路径中的键。
//匹配英文key:匹配一个或多个大写字母、小写字母、数字、下划线、连字符、冒号或空格。
private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
//定义一个静态正则表达式模式,用于匹配JSON路径中的索引。
private static final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");
//创建一个ObjectMapper对象,用于解析JSON字符串。
private static final ObjectMapper objectMapper = new ObjectMapper();
//创建一个JavaType对象,用于表示Map类型。
private static final JavaType MAP_TYPE = objectMapper.getTypeFactory().constructType(Map.class);
//创建一个JavaType对象,用于表示List类型。
private static final JavaType LIST_TYPE = objectMapper.getTypeFactory().constructType(List.class);
//静态代码块,用于配置ObjectMapper的一些特性。
static {
// Allows for unescaped ASCII control characters in JSON values
objectMapper.enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS.mappedFeature());
// Enabled to accept quoting of all character backslash qooting mechanism
objectMapper.enable(JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER.mappedFeature());
}
// An LRU cache using a linked hash map
//定义了一个静态内部类HashCache,用作LRU缓存。
static class HashCache<K, V> extends LinkedHashMap<K, V> {
private static final int CACHE_SIZE = 16;
private static final int INIT_SIZE = 32;
private static final float LOAD_FACTOR = 0.6f;
HashCache() {
super(INIT_SIZE, LOAD_FACTOR);
}
private static final long serialVersionUID = 1;
@Override
protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
return size() > CACHE_SIZE;
}
}
//声明了一个名为extractObjectCache的HashMap对象,用于缓存已提取的JSON对象。
Map<String, Object> extractObjectCache = new HashCache<String, Object>();
//声明了一个名为pathExprCache的HashMap对象,用于缓存已解析的JSON路径表达式。
Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
//声明了一个名为indexListCache的HashMap对象,用于缓存已解析的JSON路径中的索引列表。
Map<String, ArrayList<String>> indexListCache =
new HashCache<String, ArrayList<String>>();
//声明了一个名为mKeyGroup1Cache的HashMap对象,用于缓存JSON路径中的键。
Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
//声明了一个名为mKeyMatchesCache的HashMap对象,用于缓存JSON路径中的键是否匹配的结果。
Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();
//构造函数,没有参数。
public UDFGetJsonObjectCN() {
}
/**
* Extract json object from a json string based on json path specified, and
* return json string of the extracted json object. It will return null if the
* input json string is invalid.
*
* A limited version of JSONPath supported: $ : Root object . : Child operator
* [] : Subscript operator for array * : Wildcard for []
*
* Syntax not supported that's worth noticing: '' : Zero length string as key
* .. : Recursive descent &#064; : Current object/element () : Script
* expression ?() : Filter (script) expression. [,] : Union operator
* [start:end:step] : array slice operator
*
* @param jsonString
* the json string.
* @param pathString
* the json path expression.
* @return json string or null when an error happens.
*/
//evaluate方法,用于提取指定路径的JSON对象并返回JSON字符串。
public Text evaluate(String jsonString, String pathString) {
if (jsonString == null || jsonString.isEmpty() || pathString == null
|| pathString.isEmpty() || pathString.charAt(0) != '$') {
return null;
}
int pathExprStart = 1;
boolean unknownType = pathString.equals("$");
boolean isRootArray = false;
if (pathString.length() > 1) {
if (pathString.charAt(1) == '[') {
pathExprStart = 0;
isRootArray = true;
} else if (pathString.charAt(1) == '.') {
isRootArray = pathString.length() > 2 && pathString.charAt(2) == '[';
} else {
return null;
}
}
// Cache pathExpr
String[] pathExpr = pathExprCache.get(pathString);
if (pathExpr == null) {
pathExpr = pathString.split("\\.", -1);
pathExprCache.put(pathString, pathExpr);
}
// Cache extractObject
Object extractObject = extractObjectCache.get(jsonString);
if (extractObject == null) {
if (unknownType) {
try {
extractObject = objectMapper.readValue(jsonString, LIST_TYPE);
} catch (Exception e) {
// Ignore exception
}
if (extractObject == null) {
try {
extractObject = objectMapper.readValue(jsonString, MAP_TYPE);
} catch (Exception e) {
return null;
}
}
} else {
JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE;
try {
extractObject = objectMapper.readValue(jsonString, javaType);
} catch (Exception e) {
return null;
}
}
extractObjectCache.put(jsonString, extractObject);
}
for (int i = pathExprStart; i < pathExpr.length; i++) {
if (extractObject == null) {
return null;
}
extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray);
}
Text result = new Text();
if (extractObject instanceof Map || extractObject instanceof List) {
try {
result.set(objectMapper.writeValueAsString(extractObject));
} catch (Exception e) {
return null;
}
} else if (extractObject != null) {
result.set(extractObject.toString());
} else {
return null;
}
return result;
}
//extract方法,递归地提取JSON对象。
private Object extract(Object json, String path, boolean skipMapProc) {
// skip MAP processing for the first path element if root is array
if (!skipMapProc) {
// Cache patternkey.matcher(path).matches()
Matcher mKey = null;
Boolean mKeyMatches = mKeyMatchesCache.get(path);
if (mKeyMatches == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
}
if (!mKeyMatches.booleanValue()) {
return null;
}
// Cache mkey.group(1)
String mKeyGroup1 = mKeyGroup1Cache.get(path);
if (mKeyGroup1 == null) {
if (mKey == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
if (!mKeyMatches.booleanValue()) {
return null;
}
}
mKeyGroup1 = mKey.group(1);
mKeyGroup1Cache.put(path, mKeyGroup1);
}
json = extract_json_withkey(json, mKeyGroup1);
}
// Cache indexList
ArrayList<String> indexList = indexListCache.get(path);
if (indexList == null) {
Matcher mIndex = patternIndex.matcher(path);
indexList = new ArrayList<String>();
while (mIndex.find()) {
indexList.add(mIndex.group(1));
}
indexListCache.put(path, indexList);
}
if (indexList.size() > 0) {
json = extract_json_withindex(json, indexList);
}
return json;
}
//创建一个名为jsonList的AddingList对象,用于存储提取出来的JSON对象。
private transient AddingList jsonList = new AddingList();
//定义了一个静态内部类AddingList,继承自ArrayList<Object>,用于添加JSON对象到jsonList中。
private static class AddingList extends ArrayList<Object> {
private static final long serialVersionUID = 1L;
@Override
public Iterator<Object> iterator() {
return Iterators.forArray(toArray());
}
@Override
public void removeRange(int fromIndex, int toIndex) {
super.removeRange(fromIndex, toIndex);
}
};
//extract_json_withindex方法,根据JSON路径中的索引提取JSON对象。
@SuppressWarnings("unchecked")
private Object extract_json_withindex(Object json, ArrayList<String> indexList) {
jsonList.clear();
jsonList.add(json);
for (String index : indexList) {
int targets = jsonList.size();
if (index.equalsIgnoreCase("*")) {
for (Object array : jsonList) {
if (array instanceof List) {
for (int j = 0; j < ((List<Object>)array).size(); j++) {
jsonList.add(((List<Object>)array).get(j));
}
}
}
} else {
for (Object array : jsonList) {
int indexValue = Integer.parseInt(index);
if (!(array instanceof List)) {
continue;
}
List<Object> list = (List<Object>) array;
if (indexValue >= list.size()) {
continue;
}
jsonList.add(list.get(indexValue));
}
}
if (jsonList.size() == targets) {
return null;
}
jsonList.removeRange(0, targets);
}
if (jsonList.isEmpty()) {
return null;
}
return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0);
}
//extract_json_withkey方法,根据JSON路径中的键提取JSON对象。
@SuppressWarnings("unchecked")
private Object extract_json_withkey(Object json, String path) {
if (json instanceof List) {
List<Object> jsonArray = new ArrayList<Object>();
for (int i = 0; i < ((List<Object>) json).size(); i++) {
Object json_elem = ((List<Object>) json).get(i);
Object json_obj = null;
if (json_elem instanceof Map) {
json_obj = ((Map<String, Object>) json_elem).get(path);
} else {
continue;
}
if (json_obj instanceof List) {
for (int j = 0; j < ((List<Object>) json_obj).size(); j++) {
jsonArray.add(((List<Object>) json_obj).get(j));
}
} else if (json_obj != null) {
jsonArray.add(json_obj);
}
}
return (jsonArray.size() == 0) ? null : jsonArray;
} else if (json instanceof Map) {
return ((Map<String, Object>) json).get(path);
} else {
return null;
}
}
}
代码做了一些注释,我们可以发现 private final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
这个就是匹配 key 的模式串,它的意思是匹配以数字、字母、_、-、:、空格 为开头的字符串。那么这个匹配模式串就决定了,get_json_object
函数无法匹配出 key 中带中文的键值对,即select get_json_object('{ "姓名":"张三" , "年龄":"18" }', '$.姓名');
结果为 null;而 select get_json_object('{ "abc姓名":"张三" , "abc":"18" }', '$.abc姓名');
中只能匹配以数字、字母、_、-、:、空格 为开头的字符串,所以将 abc姓名
中的 abc
当作 key 去取 value 值,所以得到的值为18。
解决办法
知道问题的原因了,那么我们怎么解决这个问题呢,其实很简单,我们只需要修改代码中匹配 key 的正则表达式就可以了。
其实我们可以将 get_json_object
函数的源码拿出来重新写一个 UDF 函数就可以了。
Hive-2.1.1 版本
需要注意自己 Hive 的版本,我们以 Hive-2.1.1 版本为例,代码如下:
package com.yan.hive.udf;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.collect.Iterators;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser.Feature;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.map.type.TypeFactory;
import org.codehaus.jackson.type.JavaType;
/**
* UDFJson.
*
*/
@Description(name = "get_json_object_cn",
value = "_FUNC_(json_txt, path) - Extract a json object from path ",
extended = "Extract json object from a json string based on json path "
+ "specified, and return json string of the extracted json object. It "
+ "will return null if the input json string is invalid.\n"
+ "A limited version of JSONPath supported:\n"
+ " $ : Root object\n"
+ " . : Child operator\n"
+ " [] : Subscript operator for array\n"
+ " * : Wildcard for []\n"
+ "Syntax not supported that's worth noticing:\n"
+ " '' : Zero length string as key\n"
+ " .. : Recursive descent\n"
+ " &#064; : Current object/element\n"
+ " () : Script expression\n"
+ " ?() : Filter (script) expression.\n"
+ " [,] : Union operator\n"
+ " [start:end:step] : array slice operator\n")
public class UDFGetJsonObjectCN extends UDF {
//private final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
private final Pattern patternKey = Pattern.compile("^([^\\[\\]]+).*");
private final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");
private static final JsonFactory JSON_FACTORY = new JsonFactory();
static {
// Allows for unescaped ASCII control characters in JSON values
JSON_FACTORY.enable(Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
// Enabled to accept quoting of all character backslash qooting mechanism
JSON_FACTORY.enable(Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER);
}
private static final ObjectMapper MAPPER = new ObjectMapper(JSON_FACTORY);
private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class);
private static final JavaType LIST_TYPE = TypeFactory.fromClass(List.class);
// An LRU cache using a linked hash map
static class HashCache<K, V> extends LinkedHashMap<K, V> {
private static final int CACHE_SIZE = 16;
private static final int INIT_SIZE = 32;
private static final float LOAD_FACTOR = 0.6f;
HashCache() {
super(INIT_SIZE, LOAD_FACTOR);
}
private static final long serialVersionUID = 1;
@Override
protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
return size() > CACHE_SIZE;
}
}
static Map<String, Object> extractObjectCache = new HashCache<String, Object>();
static Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
static Map<String, ArrayList<String>> indexListCache =
new HashCache<String, ArrayList<String>>();
static Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
static Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();
Text result = new Text();
public UDFGetJsonObjectCN() {
}
/**
* Extract json object from a json string based on json path specified, and
* return json string of the extracted json object. It will return null if the
* input json string is invalid.
*
* A limited version of JSONPath supported: $ : Root object . : Child operator
* [] : Subscript operator for array * : Wildcard for []
*
* Syntax not supported that's worth noticing: '' : Zero length string as key
* .. : Recursive descent &#064; : Current object/element () : Script
* expression ?() : Filter (script) expression. [,] : Union operator
* [start:end:step] : array slice operator
*
* @param jsonString
* the json string.
* @param pathString
* the json path expression.
* @return json string or null when an error happens.
*/
public Text evaluate(String jsonString, String pathString) {
if (jsonString == null || jsonString.isEmpty() || pathString == null
|| pathString.isEmpty() || pathString.charAt(0) != '$') {
return null;
}
int pathExprStart = 1;
boolean isRootArray = false;
if (pathString.length() > 1) {
if (pathString.charAt(1) == '[') {
pathExprStart = 0;
isRootArray = true;
} else if (pathString.charAt(1) == '.') {
isRootArray = pathString.length() > 2 && pathString.charAt(2) == '[';
} else {
return null;
}
}
// Cache pathExpr
String[] pathExpr = pathExprCache.get(pathString);
if (pathExpr == null) {
pathExpr = pathString.split("\\.", -1);
pathExprCache.put(pathString, pathExpr);
}
// Cache extractObject
Object extractObject = extractObjectCache.get(jsonString);
if (extractObject == null) {
JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE;
try {
extractObject = MAPPER.readValue(jsonString, javaType);
} catch (Exception e) {
return null;
}
extractObjectCache.put(jsonString, extractObject);
}
for (int i = pathExprStart; i < pathExpr.length; i++) {
if (extractObject == null) {
return null;
}
extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray);
}
if (extractObject instanceof Map || extractObject instanceof List) {
try {
result.set(MAPPER.writeValueAsString(extractObject));
} catch (Exception e) {
return null;
}
} else if (extractObject != null) {
result.set(extractObject.toString());
} else {
return null;
}
return result;
}
private Object extract(Object json, String path, boolean skipMapProc) {
// skip MAP processing for the first path element if root is array
if (!skipMapProc) {
// Cache patternkey.matcher(path).matches()
Matcher mKey = null;
Boolean mKeyMatches = mKeyMatchesCache.get(path);
if (mKeyMatches == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
}
if (!mKeyMatches.booleanValue()) {
return null;
}
// Cache mkey.group(1)
String mKeyGroup1 = mKeyGroup1Cache.get(path);
if (mKeyGroup1 == null) {
if (mKey == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
if (!mKeyMatches.booleanValue()) {
return null;
}
}
mKeyGroup1 = mKey.group(1);
mKeyGroup1Cache.put(path, mKeyGroup1);
}
json = extract_json_withkey(json, mKeyGroup1);
}
// Cache indexList
ArrayList<String> indexList = indexListCache.get(path);
if (indexList == null) {
Matcher mIndex = patternIndex.matcher(path);
indexList = new ArrayList<String>();
while (mIndex.find()) {
indexList.add(mIndex.group(1));
}
indexListCache.put(path, indexList);
}
if (indexList.size() > 0) {
json = extract_json_withindex(json, indexList);
}
return json;
}
private transient AddingList jsonList = new AddingList();
private static class AddingList extends ArrayList<Object> {
@Override
public Iterator<Object> iterator() {
return Iterators.forArray(toArray());
}
@Override
public void removeRange(int fromIndex, int toIndex) {
super.removeRange(fromIndex, toIndex);
}
};
@SuppressWarnings("unchecked")
private Object extract_json_withindex(Object json, ArrayList<String> indexList) {
jsonList.clear();
jsonList.add(json);
for (String index : indexList) {
int targets = jsonList.size();
if (index.equalsIgnoreCase("*")) {
for (Object array : jsonList) {
if (array instanceof List) {
for (int j = 0; j < ((List<Object>)array).size(); j++) {
jsonList.add(((List<Object>)array).get(j));
}
}
}
} else {
for (Object array : jsonList) {
int indexValue = Integer.parseInt(index);
if (!(array instanceof List)) {
continue;
}
List<Object> list = (List<Object>) array;
if (indexValue >= list.size()) {
continue;
}
jsonList.add(list.get(indexValue));
}
}
if (jsonList.size() == targets) {
return null;
}
jsonList.removeRange(0, targets);
}
if (jsonList.isEmpty()) {
return null;
}
return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0);
}
@SuppressWarnings("unchecked")
private Object extract_json_withkey(Object json, String path) {
if (json instanceof List) {
List<Object> jsonArray = new ArrayList<Object>();
for (int i = 0; i < ((List<Object>) json).size(); i++) {
Object json_elem = ((List<Object>) json).get(i);
Object json_obj = null;
if (json_elem instanceof Map) {
json_obj = ((Map<String, Object>) json_elem).get(path);
} else {
continue;
}
if (json_obj instanceof List) {
for (int j = 0; j < ((List<Object>) json_obj).size(); j++) {
jsonArray.add(((List<Object>) json_obj).get(j));
}
} else if (json_obj != null) {
jsonArray.add(json_obj);
}
}
return (jsonArray.size() == 0) ? null : jsonArray;
} else if (json instanceof Map) {
return ((Map<String, Object>) json).get(path);
} else {
return null;
}
}
}
需要导入的依赖,要和自己集群的版本契合,Hadoop 的版本及 Hive 的版本。
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.hive</groupId>
<artifactId>hivetest</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<hadoop.version>3.0.0</hadoop.version>
<hive.version>2.1.1</hive.version>
<jackson.version>1.9.2</jackson.version>
<guava.version>14.0.1</guava.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-core-asl</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-jaxrs</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-xc</artifactId>
<version>${jackson.version}</version>
</dependency>
</dependencies>
</project>
注意: 因为上述UDF中也用到了 com.google.guava
和 org.codehaus.jackson
,所以这两个依赖要和 hive 版本中所用的依赖版本一致。
Hive-4.0.0 版本
package com.yan.hive.udf;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.fasterxml.jackson.core.json.JsonReadFeature;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterators;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
* @author Yan
* @create 2023-08-05 22:21
* hive解析json中文key
*/
@Description(name = "get_json_object_cn",
value = "_FUNC_(json_txt, path) - Extract a json object from path ",
extended = "Extract json object from a json string based on json path "
+ "specified, and return json string of the extracted json object. It "
+ "will return null if the input json string is invalid.\n"
+ "A limited version of JSONPath supported:\n"
+ " $ : Root object\n"
+ " . : Child operator\n"
+ " [] : Subscript operator for array\n"
+ " * : Wildcard for []\n"
+ "Syntax not supported that's worth noticing:\n"
+ " '' : Zero length string as key\n"
+ " .. : Recursive descent\n"
+ " &#064; : Current object/element\n"
+ " () : Script expression\n"
+ " ?() : Filter (script) expression.\n"
+ " [,] : Union operator\n"
+ " [start:end:step] : array slice operator\n")
//定义了一个名为UDFJson的类,继承自UDF类。
public class UDFGetJsonObjectCN extends UDF {
//定义一个静态正则表达式模式,用于匹配JSON路径中的键。
//匹配英文key:匹配一个或多个大写字母、小写字母、数字、下划线、连字符、冒号或空格。
//private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
//可以匹配中文,\\p{L}来匹配任意Unicode字母字符,包括中文字符:英文、数字、下划线、连字符、冒号、空格和中文字符。
//private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s\\p{L}]+).*");
//可以匹配中文,\\p{L}来匹配任意Unicode字母字符,包括中文字符,但不包含特殊字符,特殊字符需自己添加
//private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s?%*+\\p{L}]+).*");
//可以匹配中文,包含特殊字符,但不包含英文下的点(.);还有就是匹配不到路径中的索引了
//private static final Pattern patternKey = Pattern.compile("^(.+).*");
//可以匹配中文,包含特殊字符,不包中括号"[]",但不包含英文下的点(.);这样就可以匹配路径中的索引了
private static final Pattern patternKey = Pattern.compile("^([^\\[\\]]+).*");
//定义一个静态正则表达式模式,用于匹配JSON路径中的索引。
private static final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");
//创建一个ObjectMapper对象,用于解析JSON字符串。
private static final ObjectMapper objectMapper = new ObjectMapper();
//创建一个JavaType对象,用于表示Map类型。
private static final JavaType MAP_TYPE = objectMapper.getTypeFactory().constructType(Map.class);
//创建一个JavaType对象,用于表示List类型。
private static final JavaType LIST_TYPE = objectMapper.getTypeFactory().constructType(List.class);
//静态代码块,用于配置ObjectMapper的一些特性。
static {
// Allows for unescaped ASCII control characters in JSON values
objectMapper.enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS.mappedFeature());
// Enabled to accept quoting of all character backslash qooting mechanism
objectMapper.enable(JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER.mappedFeature());
}
// An LRU cache using a linked hash map
//定义了一个静态内部类HashCache,用作LRU缓存。
static class HashCache<K, V> extends LinkedHashMap<K, V> {
private static final int CACHE_SIZE = 16;
private static final int INIT_SIZE = 32;
private static final float LOAD_FACTOR = 0.6f;
HashCache() {
super(INIT_SIZE, LOAD_FACTOR);
}
private static final long serialVersionUID = 1;
@Override
protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
return size() > CACHE_SIZE;
}
}
//声明了一个名为extractObjectCache的HashMap对象,用于缓存已提取的JSON对象。
Map<String, Object> extractObjectCache = new HashCache<String, Object>();
//声明了一个名为pathExprCache的HashMap对象,用于缓存已解析的JSON路径表达式。
Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
//声明了一个名为indexListCache的HashMap对象,用于缓存已解析的JSON路径中的索引列表。
Map<String, ArrayList<String>> indexListCache =
new HashCache<String, ArrayList<String>>();
//声明了一个名为mKeyGroup1Cache的HashMap对象,用于缓存JSON路径中的键。
Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
//声明了一个名为mKeyMatchesCache的HashMap对象,用于缓存JSON路径中的键是否匹配的结果。
Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();
//构造函数,没有参数。
public UDFGetJsonObjectCN() {
}
/**
* Extract json object from a json string based on json path specified, and
* return json string of the extracted json object. It will return null if the
* input json string is invalid.
*
* A limited version of JSONPath supported: $ : Root object . : Child operator
* [] : Subscript operator for array * : Wildcard for []
*
* Syntax not supported that's worth noticing: '' : Zero length string as key
* .. : Recursive descent &#064; : Current object/element () : Script
* expression ?() : Filter (script) expression. [,] : Union operator
* [start:end:step] : array slice operator
*
* @param jsonString
* the json string.
* @param pathString
* the json path expression.
* @return json string or null when an error happens.
*/
//evaluate方法,用于提取指定路径的JSON对象并返回JSON字符串。
public Text evaluate(String jsonString, String pathString) {
if (jsonString == null || jsonString.isEmpty() || pathString == null
|| pathString.isEmpty() || pathString.charAt(0) != '$') {
return null;
}
int pathExprStart = 1;
boolean unknownType = pathString.equals("$");
boolean isRootArray = false;
if (pathString.length() > 1) {
if (pathString.charAt(1) == '[') {
pathExprStart = 0;
isRootArray = true;
} else if (pathString.charAt(1) == '.') {
isRootArray = pathString.length() > 2 && pathString.charAt(2) == '[';
} else {
return null;
}
}
// Cache pathExpr
String[] pathExpr = pathExprCache.get(pathString);
if (pathExpr == null) {
pathExpr = pathString.split("\\.", -1);
pathExprCache.put(pathString, pathExpr);
}
// Cache extractObject
Object extractObject = extractObjectCache.get(jsonString);
if (extractObject == null) {
if (unknownType) {
try {
extractObject = objectMapper.readValue(jsonString, LIST_TYPE);
} catch (Exception e) {
// Ignore exception
}
if (extractObject == null) {
try {
extractObject = objectMapper.readValue(jsonString, MAP_TYPE);
} catch (Exception e) {
return null;
}
}
} else {
JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE;
try {
extractObject = objectMapper.readValue(jsonString, javaType);
} catch (Exception e) {
return null;
}
}
extractObjectCache.put(jsonString, extractObject);
}
for (int i = pathExprStart; i < pathExpr.length; i++) {
if (extractObject == null) {
return null;
}
extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray);
}
Text result = new Text();
if (extractObject instanceof Map || extractObject instanceof List) {
try {
result.set(objectMapper.writeValueAsString(extractObject));
} catch (Exception e) {
return null;
}
} else if (extractObject != null) {
result.set(extractObject.toString());
} else {
return null;
}
return result;
}
//extract方法,递归地提取JSON对象。
private Object extract(Object json, String path, boolean skipMapProc) {
// skip MAP processing for the first path element if root is array
if (!skipMapProc) {
// Cache patternkey.matcher(path).matches()
Matcher mKey = null;
Boolean mKeyMatches = mKeyMatchesCache.get(path);
if (mKeyMatches == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
}
if (!mKeyMatches.booleanValue()) {
return null;
}
// Cache mkey.group(1)
String mKeyGroup1 = mKeyGroup1Cache.get(path);
if (mKeyGroup1 == null) {
if (mKey == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
if (!mKeyMatches.booleanValue()) {
return null;
}
}
mKeyGroup1 = mKey.group(1);
mKeyGroup1Cache.put(path, mKeyGroup1);
}
json = extract_json_withkey(json, mKeyGroup1);
}
// Cache indexList
ArrayList<String> indexList = indexListCache.get(path);
if (indexList == null) {
Matcher mIndex = patternIndex.matcher(path);
indexList = new ArrayList<String>();
while (mIndex.find()) {
indexList.add(mIndex.group(1));
}
indexListCache.put(path, indexList);
}
if (indexList.size() > 0) {
json = extract_json_withindex(json, indexList);
}
return json;
}
//创建一个名为jsonList的AddingList对象,用于存储提取出来的JSON对象。
private transient AddingList jsonList = new AddingList();
//定义了一个静态内部类AddingList,继承自ArrayList<Object>,用于添加JSON对象到jsonList中。
private static class AddingList extends ArrayList<Object> {
private static final long serialVersionUID = 1L;
@Override
public Iterator<Object> iterator() {
return Iterators.forArray(toArray());
}
@Override
public void removeRange(int fromIndex, int toIndex) {
super.removeRange(fromIndex, toIndex);
}
};
//extract_json_withindex方法,根据JSON路径中的索引提取JSON对象。
@SuppressWarnings("unchecked")
private Object extract_json_withindex(Object json, ArrayList<String> indexList) {
jsonList.clear();
jsonList.add(json);
for (String index : indexList) {
int targets = jsonList.size();
if (index.equalsIgnoreCase("*")) {
for (Object array : jsonList) {
if (array instanceof List) {
for (int j = 0; j < ((List<Object>)array).size(); j++) {
jsonList.add(((List<Object>)array).get(j));
}
}
}
} else {
for (Object array : jsonList) {
int indexValue = Integer.parseInt(index);
if (!(array instanceof List)) {
continue;
}
List<Object> list = (List<Object>) array;
if (indexValue >= list.size()) {
continue;
}
jsonList.add(list.get(indexValue));
}
}
if (jsonList.size() == targets) {
return null;
}
jsonList.removeRange(0, targets);
}
if (jsonList.isEmpty()) {
return null;
}
return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0);
}
//extract_json_withkey方法,根据JSON路径中的键提取JSON对象。
@SuppressWarnings("unchecked")
private Object extract_json_withkey(Object json, String path) {
if (json instanceof List) {
List<Object> jsonArray = new ArrayList<Object>();
for (int i = 0; i < ((List<Object>) json).size(); i++) {
Object json_elem = ((List<Object>) json).get(i);
Object json_obj = null;
if (json_elem instanceof Map) {
json_obj = ((Map<String, Object>) json_elem).get(path);
} else {
continue;
}
if (json_obj instanceof List) {
for (int j = 0; j < ((List<Object>) json_obj).size(); j++) {
jsonArray.add(((List<Object>) json_obj).get(j));
}
} else if (json_obj != null) {
jsonArray.add(json_obj);
}
}
return (jsonArray.size() == 0) ? null : jsonArray;
} else if (json instanceof Map) {
return ((Map<String, Object>) json).get(path);
} else {
return null;
}
}
}
依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.hive</groupId>
<artifactId>hivetest</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<hadoop.version>3.3.1</hadoop.version>
<hive.version>4.0.0</hive.version>
<jackson.version>2.13.5</jackson.version>
<guava.version>22.0</guava.version>
</properties>
<dependencies>
<dependency>
<groupId>com.fasterxml.jackson</groupId>
<artifactId>jackson-bom</artifactId>
<version>${jackson.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
</dependency>
</dependencies>
</project>
Hive 版本不同,之间的依赖可能就有些许差距,如果不注意的话可能会报依赖错误。
参考文章:
get_json_object不能解析json里面中文的key
get_json_object源码
impala&hive自定义UDF解析json中文key