#gStore-weekly | gAnswer源码分析：后处理

gAnswer将自然语言问题转化成查询图，再与图数据库中的RDF图进行匹配，从而生成用于查询的SPARQL语句。在将SPARQL语句交给gStore执行查询之前，还需要进行修复、聚合以及其他一些后处理工作，本文聚焦于此。

// step 0: Node (entity & type & literal) Recognition 
// step 1: question parsing (dependency tree, sentence type)
// step 2: build query graph (structure construction, relation extraction, top-k join)

// step 3: some fix (such as "one-node" or "ask-one-triple") and aggregation
// NOTE(review): t is declared outside this excerpt; presumably it marks the start time of step 3 -- confirm.
t = System.currentTimeMillis();
// process(qlog) runs the three fixes, aggregation recognition and the query-type decision on qlog.
AddtionalFix step3 = new AddtionalFix();
step3.process(qlog);

在前几期关于gAnswer的文章中，我们完成了算法前三步的解析，认识了依存分析，节点提取，关系提取，进一步的查询图生成，子图匹配等模块。上面是第四步修复与聚合的入口函数，注释中，举了两个例子，"one-node"单节点查询和"ask-one-triple"，之后都会有具体方法的解析。

public HashMap<String, String> pattern2category = new HashMap<String, String>();

public AddtionalFix()
{
    // Some category mappings for DBpedia, try automatic linking methods later. | base form
    pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters");
    pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World");
    pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus");
    pattern2category.put("13_british_colony", "Thirteen_Colonies");
}

首先，在 AddtionalFix 类内部创建了一个名为 pattern2category 的哈希映射，用于将查询模式映射到类别。

public void process(QueryLogger qlog)
{
    fixCategory(qlog);
    oneTriple(qlog);
    oneNode(qlog);
    
    //aggregation
    AggregationRecognition ar = new AggregationRecognition();
    ar.recognize(qlog);

    //query type
    decideQueryType(qlog);
}

主方法 process 接受 QueryLogger 对象 qlog 作为参数。在该方法中，依次调用了 fixCategory、oneTriple 和 oneNode 三个方法，它们共同完成修复（fix）；随后调用 ar.recognize(qlog) 进行聚合识别，最后调用 decideQueryType(qlog) 来确定查询的类型。

public void fixCategory(QueryLogger qlog)
{
    if(qlog == null || qlog.semanticUnitList == null)
       return;
    
    String var = null, category = null;
    for(SemanticUnit su: qlog.semanticUnitList)
    {
       if(su.centerWord.mayCategory)
       {
          var = "?"+su.centerWord.originalForm;
          category = su.centerWord.category;
       }
    }
    
    if(category != null && var != null)
       for(Sparql spq: qlog.rankedSparqls)
       {
          boolean occured = false;
          for(Triple tri: spq.tripleList)
          {
             if(tri.subject.equals(var))
             {
                occured = true;
                break;
             }
          }
          String oName = category;
          String pName = "subject";
          int pid = Globals.pd.predicate_2_id.get(pName);
          Triple triple =    new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100);
          spq.addTriple(triple);
       }
}

fixCategory
方法用于修复查询中的类别信息。

遍历 qlog.semanticUnitList 中的每个语义单元 su，检查其中心词 centerWord 是否带有类别信息（mayCategory 标志）。如果有，将 "?" 加上中心词的原始形式 originalForm 作为变量 var，并将中心词的类别信息赋给 category（若有多个语义单元满足条件，以最后一个为准）。
如果 category 和 var 在上一步的遍历中都得到了赋值，则遍历 qlog.rankedSparqls 中的每个 Sparql 对象 spq，为其追加一个新的三元组：主语为变量 var，谓词为 "subject"，宾语为类别 category。注意：按照上面的代码，该三元组是无条件追加的，并不取决于 spq 中是否已经存在以该变量为主语的三元组。

public void oneNode(QueryLogger qlog)
{
    if(qlog == null || qlog.semanticUnitList == null || qlog.semanticUnitList.size()>1)
       return;
    
    Word target = qlog.target;
    Word[] words = qlog.s.words;
    if(qlog.s.sentenceType != SentenceType.GeneralQuestion)
    {
       //1-1: how many [type] are there | List all [type]
       if(target.mayType && target.tmList != null)
       {
          String subName = "?"+target.originalForm;
          String typeName = target.tmList.get(0).typeName;
          Triple triple =    new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100);
          Sparql sparql = new Sparql();
          sparql.addTriple(triple);
          qlog.rankedSparqls.add(sparql);
       }
       //1-2: What is [ent]?
       else if(target.mayEnt && target.emList != null)
       {
          if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be"))
          {
             int eid = target.emList.get(0).entityID;
             String subName = target.emList.get(0).entityName;
             Triple triple =    new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+target.originalForm, null, target.emList.get(0).score);
             Sparql sparql = new Sparql();
             sparql.addTriple(triple);
             qlog.rankedSparqls.add(sparql);
          }
       }
       //1-3: Give me all Seven Wonders of the Ancient World.
       else if(target.mayCategory && target.category != null)
       {
          String oName = target.category;
          String pName = "subject";
          int pid = Globals.pd.predicate_2_id.get(pName);
          Triple triple =    new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100);
          Sparql sparql = new Sparql();
          sparql.addTriple(triple);
          qlog.rankedSparqls.add(sparql);
       }
    }
    else 
    {
       if(target.mayEnt && target.emList != null)
       {
          //2-2：Was Sigmund Freud married?
          String relMention = "";
          for(Word word: words)
             if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?"))
                relMention += word.baseForm+" ";
          if(relMention.length() > 1)
             relMention = relMention.substring(0, relMention.length()-1);
          
          ArrayList<PredicateIDAndSupport> pmList = null;
          if(Globals.pd.nlPattern_2_predicateList.containsKey(relMention))
             pmList = Globals.pd.nlPattern_2_predicateList.get(relMention);
          
          if(pmList != null && pmList.size() > 0)
          {
             int pid = pmList.get(0).predicateID;
             int eid = target.emList.get(0).entityID;
             String subName = target.emList.get(0).entityName;
             Triple triple =    new Triple(eid, subName, pid, Triple.VAR_ROLE_ID, "?x", null, 100);
             Sparql sparql = new Sparql();
             sparql.addTriple(triple);
             qlog.rankedSparqls.add(sparql);
          }
    
          //2-3：Are penguins endangered?
          else
          {
             if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm))
             {
                String oName = pattern2category.get(words[target.position].baseForm);
                String pName = "subject";
                int pid = Globals.pd.predicate_2_id.get(pName);
                int eid = target.emList.get(0).entityID;
                String subName = target.emList.get(0).entityName;
                Triple triple =    new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100);
                Sparql sparql = new Sparql();
                sparql.addTriple(triple);
                qlog.rankedSparqls.add(sparql);
             }
          }
       }
       //2-1: Are there any [castles_in_the_United_States](yago:type)
       else if(target.mayType && target.tmList != null)
       {
          String typeName = target.tmList.get(0).typeName;
          String subName = "?" + target.originalForm;
          //System.out.println("typeName="+typeName+" subName="+subName);
          Triple triple =    new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100);
          Sparql sparql = new Sparql();
          sparql.addTriple(triple);
          qlog.rankedSparqls.add(sparql);
       }
    }
}

关于代码中用于识别单节点查询（one-Node query）的逻辑，根据不同情况分成了两大类和六种具体情况：

第一大类：特殊问题（Special question）和祈使句（Imperative sentence），它会处理包含一个节点的查询，并根据不同的情况生成相应的查询三元组，并将其添加到 rankedSparqls 列表中。
- 1-1："how many [type] are there" 和 "list all [type]" 这样的问题，首先检查识别的目标词 target 是否可能是一个类型 type。创建一个三元组，其中主语是变量 subName（即 "?" 加上目标词的原始形式），谓词是全局定义的表示类型关系的谓词ID（Globals.pd.typePredicateID），宾语是类型名称。
- 1-2："What is backgammon?" 和 "What is a bipolar syndrome?" 这样的问题，首先检查识别的目标词 target 是否可能是一个实体 entity，并要求句子以 "what be" 开头。创建一个三元组，其中主语是识别到的实体 subName（即候选实体的 entityName），谓词是全局定义的表示类型关系的谓词ID（Globals.pd.typePredicateID），宾语是一个变量，即 "?" 加上目标词的原始形式（"?"+target.originalForm）。
- 1-3："Give me all Seven Wonders of the Ancient World." 这样的问题，首先检查识别的目标词 target 是否可能是一个类别 category。创建一个三元组，其中主语是变量 "?"+target.originalForm，谓词是 "subject" 对应的谓词ID，宾语是目标词的类别（oName）。
第二大类：一般问题（General question），根据目标词target
是否可能是实体mayEnt
和是否有实体列表emList
，进行不同的处理。
- 2-1："Are there any [castles_in_the_United_States]"这样的问题，首先检查识别的目标词target
  是否可能是一个类型type
  ，需要检查特定类型的实体是否存在。创建一个三元组，主语是一个变量（由 subName
  指定），谓词是一个特定的谓词（由 Globals.pd.typePredicateID
  指定），宾语是一个特定实体的类型（由 typeName
  指定）。
- 2-2："Was Sigmund Freud married?" 这样的问题，首先检查识别的目标词 target 是否可能是一个实体 entity，用户查询可能是关于特定实体的事实。创建一个三元组，其中主语是实体名，谓词是根据问句中其余词组成的关系短语查到的谓词，宾语是变量 ?x。
- 2-3："Are penguins endangered?" 这样的问题，首先检查识别的目标词 target 是否可能是一个实体 entity，用户可能在询问特定实体的类别。创建一个三元组，其中主语是实体名，谓词是 "subject" 对应的谓词ID，宾语是由 pattern2category 映射得到的类别名。

这些情况将影响代码中单节点查询的处理方式。

public void oneTriple (QueryLogger qlog)
 {
  if(qlog == null || qlog.semanticUnitList == null)
   return;
  
  if(qlog.s.sentenceType == SentenceType.SpecialQuestion)
  {
   Word[] words = qlog.s.words;
   if(qlog.semanticUnitList.size() == 2)
   {
    Word entWord = null, whWord = null;
    for(int i=0;i<qlog.semanticUnitList.size();i++)
    {
   if(qlog.semanticUnitList.get(i).centerWord.baseForm.startsWith("wh"))
      whWord = qlog.semanticUnitList.get(i).centerWord;
     if(qlog.semanticUnitList.get(i).centerWord.mayEnt)
      entWord = qlog.semanticUnitList.get(i).centerWord;
    }
    // 1-1: (what) is [ent] | we guess users may want the type of ent.
    if(entWord!=null && whWord!= null && words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be"))
    {
     int eid = entWord.emList.get(0).entityID;
     String subName = entWord.emList.get(0).entityName;
     Triple triple = new Triple(eid, subName, Globals.pd.typePredicateID, Triple.VAR_ROLE_ID, "?"+whWord.originalForm, null, entWord.emList.get(0).score);
     Sparql sparql = new Sparql();
     sparql.addTriple(triple);
     qlog.rankedSparqls.add(sparql);
    }
   }
  }
 }
}

oneTriple
方法用于处理在句子中能识别三元组但没有合适关系的情况。

检查句子类型是否为特殊问题（SentenceType.SpecialQuestion
）。如果是，继续检查是否识别出了两个语义单元（semanticUnitList.size() == 2
）。
如果符合条件，尝试构建一个三元组。这里主要处理了一种情况：
- 遍历语义单元，根据语义单元的属性，识别实体词和疑问词。
- 类似 "What is [ent]?" 这样的问题。根据识别到的实体词（entWord
  ）和疑问词（whWord
  ），构建一个以实体为主语、类型为谓词、疑问词为宾语的三元组，然后将这个三元组添加到 SPARQL 查询列表（qlog.rankedSparqls
  ）中。

// deduplicate in SPARQL
    for(Sparql spq: rankedSparqls)
       spq.deduplicate();
    
    // Sort (descending order).
    Collections.sort(rankedSparqls);
    qlog.rankedSparqls = rankedSparqls;
    System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size());
    
    // Detect question focus.
    for (int i=0; i<qlog.rankedSparqls.size(); i++) 
    {
       // First detect by SPARQLs.
       Sparql spq = qlog.rankedSparqls.get(i);
       String questionFocus = QuestionParsing.detectQuestionFocus(spq);
       
       // If failed, use TARGET directly.
       if(questionFocus == null)
          questionFocus = "?"+qlog.target.originalForm;
       
       spq.questionFocus = questionFocus;
    }
             
    return qlog;
}

最后，对得到的 SPARQL 查询列表进行去重、排序和问题焦点检测等后处理，并返回包含处理结果的 QueryLogger 对象。