中文分词模拟器【华为OD机试 - Java / Python / C++ / JS】

题目描述

给定一个连续且不包含空格的字符串，该字符串仅包含英文小写字母及英文标点符号（逗号、分号、句号），同时给定词库，对该字符串进行精确分词。
说明：
1.精确分词：字符串分词后，不会出现重叠。即“ilovechina” ，不同词库可分割为 “i，love，china” “ilove，china”，不能分割出现重叠的"i，ilove，china",i重叠出现
2.标点符号不成词，仅用于断句
3.词库：根据外部知识库统计出来的常用词汇例：dictionary=[“i”,“love”,“china”,“lovechina”,“ilove”],
4.分词原则：采用分词顺序优先且最长匹配原则
“ilovechina”，假设分词结果 [ i,ilove,lo,love,ch,china,lovechina ] 则输出 [ilove，china]
错误输出：[i,lovechina]，原因："ilove" 优先于 "lovechina" 成词
错误输出：[i,love,china]，原因："ilove" 比 "i" 更长，遵循最长匹配原则应优先匹配 "ilove"
输入描述：
字符串长度限制：0<length<256
词库长度限制： 1<length<100000
第一行输入待分词语句 “ilovechina”
第二行输入中文词库 “i,love,china,ch,na,ve,lo,this,is,the,word”
输出描述：
按顺序输出分词结果 “i,love,china”
补充说明：
示例1
输入：
ilovechina
i,love,china,ch,na,ve,lo,this,is,the,word
输出：
i,love,china
说明：
示例2
输入：
iat
i,love,china,ch,na,ve,lo,this,is,the,word,beauti,tiful,ful
输出：
i,a,t
说明：
单个字母，不在词库中且不成词则直接输出单个字母
示例3
输入：
ilovechina,thewordisbeautiful
i,love,china,ch,na,ve,lo,this,is,the,word,beauti,tiful,ful
输出：
i,love,china,the,word,is,beauti,ful
说明：
标点符号为英文标点符号

解题思路

1. 首先，构建一个字典树（Trie）来存储词库中的所有词汇，以便能够高效地进行匹配。
2. 对输入的字符串进行遍历，从左到右依次匹配字典树中的词汇。
3. 在匹配过程中，遵循最长匹配原则，即尽量匹配更长的词汇。
4. 如果当前位置能够匹配一个词汇，将该词汇加入结果中，并将匹配位置移动到词汇的末尾；如果当前位置无法匹配任何词汇，则直接输出该单个字母并前进一位；遇到标点符号时跳过，标点仅用于断句，不参与成词。
5. 继续遍历，重复步骤3和步骤4，直至整个字符串被匹配完毕。
6. 输出结果即为分词后的字符串。

通过以上步骤，可以保证在满足分词顺序和最长匹配原则的前提下，得到正确的分词结果。在实现时，可以使用递归或迭代的方式来遍历字符串并匹配词汇。同时，为了提高效率，可以使用动态规划或其他优化方法来避免重复计算。

题解代码

Python题解代码

def word_segmentation(s, dictionary):
    """Split ``s`` into words using forward maximum matching.

    The text is scanned left to right. Punctuation marks (``,``, ``;``, ``.``)
    only separate clauses and never appear in the output. At each position the
    longest dictionary word that starts there and stays inside the current
    clause is emitted; when no dictionary word matches, the single character
    at that position is emitted on its own.

    Args:
        s: Text to segment (lowercase letters and English punctuation).
        dictionary: Iterable of known words.

    Returns:
        List of segments in the order they occur in ``s``.
    """
    punctuation = ",;."
    words = set(dictionary)
    # Longest candidate worth probing; at least 1 so the scan always advances.
    max_word_len = max((len(word) for word in words), default=1) or 1

    result = []
    n = len(s)
    pos = 0
    while pos < n:
        if s[pos] in punctuation:
            pos += 1
            continue

        # A word may not cross a punctuation mark, so locate the clause end.
        clause_end = pos
        while clause_end < n and s[clause_end] not in punctuation:
            clause_end += 1

        while pos < clause_end:
            end = min(clause_end, pos + max_word_len)
            # Shrink the candidate until it is a known word or a single char.
            while end > pos + 1 and s[pos:end] not in words:
                end -= 1
            result.append(s[pos:end])
            pos = end

    return result


if __name__ == "__main__":
    # Line 1 of stdin is the sentence, line 2 the comma-separated dictionary.
    sentence = input().strip()
    vocabulary = input().strip().split(',')

    # Emit the segments as a single comma-separated line.
    print(','.join(word_segmentation(sentence, vocabulary)))

JAVA题解代码

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Scanner;

/**
 * Segments a sentence into words with forward maximum matching against a dictionary.
 */
public class WordSegmentation {

    /** Punctuation marks that separate clauses and are never part of a word. */
    private static final String PUNCTUATION = ",;.";

    /**
     * Splits {@code s} into words, scanning left to right.
     *
     * <p>Punctuation marks ({@code , ; .}) only separate clauses and never appear in the
     * result. At each position the longest dictionary word that starts there and stays
     * inside the current clause is emitted; when no dictionary word matches, the single
     * character at that position is emitted on its own.
     *
     * @param s text to segment
     * @param dictionary known words
     * @return the segments in the order they occur in {@code s}
     */
    public static List<String> wordSegmentation(String s, List<String> dictionary) {
        // Fully qualified so this class does not need extra imports.
        java.util.Set<String> words = new java.util.HashSet<>(dictionary);

        // Longest candidate worth probing; at least 1 so the scan always advances.
        int maxWordLength = 1;
        for (String word : words) {
            if (word != null) {
                maxWordLength = Math.max(maxWordLength, word.length());
            }
        }

        List<String> result = new ArrayList<>();
        int n = s.length();
        int pos = 0;
        while (pos < n) {
            if (isPunctuation(s.charAt(pos))) {
                pos++;
                continue;
            }

            // A word may not cross a punctuation mark, so locate the clause end.
            int clauseEnd = pos;
            while (clauseEnd < n && !isPunctuation(s.charAt(clauseEnd))) {
                clauseEnd++;
            }

            while (pos < clauseEnd) {
                int end = Math.min(clauseEnd, pos + maxWordLength);
                // Shrink the candidate until it is a known word or a single character.
                while (end > pos + 1 && !words.contains(s.substring(pos, end))) {
                    end--;
                }
                result.add(s.substring(pos, end));
                pos = end;
            }
        }

        return result;
    }

    /** Returns whether {@code c} is one of the clause-separating punctuation marks. */
    private static boolean isPunctuation(char c) {
        return PUNCTUATION.indexOf(c) >= 0;
    }

    /**
     * Reads the sentence (first line) and the comma-separated dictionary (second line)
     * from standard input and prints the segments joined by commas.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try (Scanner scanner = new Scanner(System.in)) {
            // A missing line is treated as empty instead of throwing.
            String inputString = scanner.hasNextLine() ? scanner.nextLine().trim() : "";
            String dictionaryLine = scanner.hasNextLine() ? scanner.nextLine().trim() : "";
            List<String> dictionary = Arrays.asList(dictionaryLine.split(","));

            List<String> result = wordSegmentation(inputString, dictionary);
            System.out.println(String.join(",", result));
        }
    }
}

C/C++题解代码

#include <algorithm>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

using namespace std;

// Splits `s` into words using forward maximum matching.
//
// The text is scanned left to right. Punctuation marks (',', ';', '.') only
// separate clauses and never appear in the result. At each position the
// longest dictionary word that starts there and stays inside the current
// clause is emitted; when no dictionary word matches, the single character at
// that position is emitted on its own.
//
// Returns the segments in the order they occur in `s`.
vector<string> wordSegmentation(const string& s, const vector<string>& dictionary) {
    const string punctuation = ",;.";
    const set<string> words(dictionary.begin(), dictionary.end());

    // Longest candidate worth probing; at least 1 so the scan always advances.
    string::size_type maxWordLength = 1;
    for (vector<string>::size_type k = 0; k < dictionary.size(); ++k) {
        if (dictionary[k].size() > maxWordLength) {
            maxWordLength = dictionary[k].size();
        }
    }

    vector<string> result;
    const string::size_type n = s.size();
    string::size_type pos = 0;
    while (pos < n) {
        if (punctuation.find(s[pos]) != string::npos) {
            ++pos;
            continue;
        }

        // A word may not cross a punctuation mark, so locate the clause end.
        string::size_type clauseEnd = s.find_first_of(punctuation, pos);
        if (clauseEnd == string::npos) {
            clauseEnd = n;
        }

        while (pos < clauseEnd) {
            string::size_type end = min(clauseEnd, pos + maxWordLength);
            // Shrink the candidate until it is a known word or a single character.
            while (end > pos + 1 && words.count(s.substr(pos, end - pos)) == 0) {
                --end;
            }
            result.push_back(s.substr(pos, end - pos));
            pos = end;
        }
    }

    return result;
}

// Reads the sentence (first line) and the comma-separated dictionary (second
// line) from standard input and prints the segments joined by commas.
int main() {
    string inputString, dictionaryString;
    getline(cin, inputString);
    getline(cin, dictionaryString);

    // Drop a trailing '\r' left by CRLF input so it does not leak into a token.
    if (!inputString.empty() && inputString[inputString.size() - 1] == '\r') {
        inputString.erase(inputString.size() - 1);
    }
    if (!dictionaryString.empty() && dictionaryString[dictionaryString.size() - 1] == '\r') {
        dictionaryString.erase(dictionaryString.size() - 1);
    }

    vector<string> dictionary;
    stringstream ss(dictionaryString);
    string word;
    while (getline(ss, word, ',')) {
        dictionary.push_back(word);
    }

    vector<string> result = wordSegmentation(inputString, dictionary);
    // The result may be empty (e.g. empty input), so never index result[0] blindly.
    for (vector<string>::size_type i = 0; i < result.size(); i++) {
        if (i > 0) {
            cout << ",";
        }
        cout << result[i];
    }

    return 0;
}

JS题解代码

/**
 * Splits `s` into words using forward maximum matching.
 *
 * The text is scanned left to right. Punctuation marks (',', ';', '.') only
 * separate clauses and never appear in the result. At each position the
 * longest dictionary word that starts there and stays inside the current
 * clause is emitted; when no dictionary word matches, the single character at
 * that position is emitted on its own.
 *
 * @param {string} s - Text to segment.
 * @param {string[]} dictionary - Known words.
 * @returns {string[]} Segments in the order they occur in `s`.
 */
function wordSegmentation(s, dictionary) {
    const punctuation = ',;.';
    const words = new Set(dictionary);

    // Longest candidate worth probing; at least 1 so the scan always advances.
    let maxWordLength = 1;
    for (const word of words) {
        maxWordLength = Math.max(maxWordLength, word.length);
    }

    const result = [];
    const n = s.length;
    let pos = 0;
    while (pos < n) {
        if (punctuation.includes(s[pos])) {
            pos++;
            continue;
        }

        // A word may not cross a punctuation mark, so locate the clause end.
        let clauseEnd = pos;
        while (clauseEnd < n && !punctuation.includes(s[clauseEnd])) {
            clauseEnd++;
        }

        while (pos < clauseEnd) {
            let end = Math.min(clauseEnd, pos + maxWordLength);
            // Shrink the candidate until it is a known word or a single character.
            while (end > pos + 1 && !words.has(s.substring(pos, end))) {
                end--;
            }
            result.push(s.substring(pos, end));
            pos = end;
        }
    }

    return result;
}

// Read the sentence (line 1) and the comma-separated dictionary (line 2) from
// stdin. No prompt is printed, so stdout carries only the answer line.
const readline = require('readline');
const rl = readline.createInterface({
    input: process.stdin
});

const inputLines = [];

rl.on('line', (line) => {
    // Only the first two lines are meaningful; ignore anything after them.
    if (inputLines.length < 2) {
        inputLines.push(line.trim());
    }
    if (inputLines.length === 2) {
        rl.close();
    }
});

rl.on('close', () => {
    if (inputLines.length < 2) {
        return;
    }
    const [inputString, dictionaryString] = inputLines;

    const dictionary = dictionaryString.split(',');
    const result = wordSegmentation(inputString, dictionary);
    // Join with a bare comma to match the required output format.
    console.log(result.join(','));
});