获取一串拼音所有可能性的分割方式

场景

比如输入一串拼音"luan"，我希望得到它所有可能的切分结果：既可以切成"lu an"（六安），也可以整体作为"luan"（乱）。

实现方法:

1.正则匹配
2.使用拼音词典匹配

public class Test(){

    @Test
    void splitPinyin() {
        long start = System.currentTimeMillis();
        System.out.println(new Date());
        try {
            System.out.println(JSON.toJSONString(splitPinyin(null,null,"zuanrenganga")));
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        System.out.println(new Date());
        System.out.println(System.currentTimeMillis()-start);
    }

    public List<List<String>> splitPinyin(List<List<String>> list,List<String> currentList,String pinyin) throws IOException, ClassNotFoundException {
        if(list==null){
            list= new ArrayList<>();
        }
        if(currentList==null){
            currentList= new ArrayList<>();
        }
        List<String> res = new ArrayList<>();
        int i=1;
        while (i<=pinyin.length()){
            System.out.println("==========="+i+"===="+pinyin);
            String s = pinyin.substring(0,i);
            String end = pinyin.substring(i);
            System.out.println("s-->"+s);
            System.out.println("e-->"+end);
            String pattern = "(ang|an|ai|a|ao|bang|ba[ino]?|beng|be[in]?|bing|bia[no]?|bi[en]?|bu|cang|ca[ino]?|ceng|ce[in]?|chang|cha[ino]?|cheng|che[n]?|chi|chong|chou|chuang|chua[in]|chu[ino]?|ci|cong|cou|cuan|cu[ino]?|dang|da[ino]?|deng|de[in]?|dia[no]?|ding|di[ae]?|dong|dou|duan|du[ino]?|e|er|fang|fan|fa|feng|fe[in]{1}|fo[u]?|fu|gang|ga[ino]?|geng|ge[in]?|gong|gou|guang|gua[in]?|gu[ino]?|hang|ha[ino]?|heng|he[in]?|hong|hou|huang|hua[in]?|hu[ino]?|jiang|jia[no]?|jiong|jing|ji[nu]?|juan|ju[en]?|kang|ka[ino]?|keng|ke[n]?|kong|kou|kuang|kua[in]?|ku[ino]?|lang|la[ino]?|leng|le[i]?|liang|lia[no]?|ling|li[enu]?|long|lou|luan|lu[no]?|lv[e]?|mang|ma[ino]?|meng|me[in]?|mia[no]?|ming|mi[nu]?|mo[u]?|mu|nang|na[ino]?|neng|ne[in]?|niang|nia[no]?|ning|ni[enu]?|nong|nou|nuan|nu[on]?|nv[e]?|o|pang|pa[ino]?|pa|peng|pe[in]?|ping|pia[no]?|pi[en]?|po[u]?|pu|qiang|qia[no]?|qiong|qing|qi[aenu]?|quan|qu[en]?|rang|ra[no]{1}|reng|re[n]?|rong|rou|ri|ruan|ru[ino]?|sang|sa[ino]?|seng|se[n]?|shang|sha[ino]?|sheng|she[in]?|shi|shou|shuang|shua[in]?|shu[ino]?|si|song|sou|suan|su[ino]?|tang|ta[ino]?|teng|te|ting|ti[e]?|tia[no]?|tong|tou|tuan|tu[ino]?|wang|wa[ni]?|weng|we[in]{1}|w[ou]{1}|xiang|xia[no]?|xiong|xing|xi[enu]?|xuan|xu[en]?|yang|ya[no]?|ye|ying|yi[n]?|yong|you|yo|yuan|yu[en]?|zang|za[ino]?|zeng|ze[in]?|zhang|zha[ino]?|zheng|zhe[in]?|zhi|zhong|zhou|zhuang|zhua[in]?|zhu[ino]?|zi|zong|zou|zuan|zu[ino]?){1}";
            String pattern2 = "(ang|an|ai|a|ao|bang|ba[ino]?|beng|be[in]?|bing|bia[no]?|bi[en]?|bu|cang|ca[ino]?|ceng|ce[in]?|chang|cha[ino]?|cheng|che[n]?|chi|chong|chou|chuang|chua[in]|chu[ino]?|ci|cong|cou|cuan|cu[ino]?|dang|da[ino]?|deng|de[in]?|dia[no]?|ding|di[ae]?|dong|dou|duan|du[ino]?|e|er|fang|fan|fa|feng|fe[in]{1}|fo[u]?|fu|gang|ga[ino]?|geng|ge[in]?|gong|gou|guang|gua[in]?|gu[ino]?|hang|ha[ino]?|heng|he[in]?|hong|hou|huang|hua[in]?|hu[ino]?|jiang|jia[no]?|jiong|jing|ji[nu]?|juan|ju[en]?|kang|ka[ino]?|keng|ke[n]?|kong|kou|kuang|kua[in]?|ku[ino]?|lang|la[ino]?|leng|le[i]?|liang|lia[no]?|ling|li[enu]?|long|lou|luan|lu[no]?|lv[e]?|mang|ma[ino]?|meng|me[in]?|mia[no]?|ming|mi[nu]?|mo[u]?|mu|nang|na[ino]?|neng|ne[in]?|niang|nia[no]?|ning|ni[enu]?|nong|nou|nuan|nu[on]?|nv[e]?|o|pang|pa[ino]?|pa|peng|pe[in]?|ping|pia[no]?|pi[en]?|po[u]?|pu|qiang|qia[no]?|qiong|qing|qi[aenu]?|quan|qu[en]?|rang|ra[no]{1}|reng|re[n]?|rong|rou|ri|ruan|ru[ino]?|sang|sa[ino]?|seng|se[n]?|shang|sha[ino]?|sheng|she[in]?|shi|shou|shuang|shua[in]?|shu[ino]?|si|song|sou|suan|su[ino]?|tang|ta[ino]?|teng|te|ting|ti[e]?|tia[no]?|tong|tou|tuan|tu[ino]?|wang|wa[ni]?|weng|we[in]{1}|w[ou]{1}|xiang|xia[no]?|xiong|xing|xi[enu]?|xuan|xu[en]?|yang|ya[no]?|ye|ying|yi[n]?|yong|you|yo|yuan|yu[en]?|zang|za[ino]?|zeng|ze[in]?|zhang|zha[ino]?|zheng|zhe[in]?|zhi|zhong|zhou|zhuang|zhua[in]?|zhu[ino]?|zi|zong|zou|zuan|zu[ino]?){1,}";
            if(!end.equals("")){
                System.out.println(System.currentTimeMillis());
                if(Pattern.matches(pattern,s)&&Pattern.matches(pattern2,end)){
                    System.out.println(System.currentTimeMillis());
                    List<String> copy = deepCopy(currentList);
                    copy.add(s);
                    list = splitPinyin(list,copy,end);
                }
            } else{
                if(Pattern.matches(pattern,s)){
                    List<String> copy = deepCopy(currentList);
                    copy.add(s);
                    list.add(copy);
                }
                return list;
            }
            i++;
        }
        return list;
    }

    public static <T> List<T> deepCopy(List<T> src) throws IOException, ClassNotFoundException {
//        System.out.println(System.currentTimeMillis());
        ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
        ObjectOutputStream out = new ObjectOutputStream(byteOut);
        out.writeObject(src);

        ByteArrayInputStream byteIn = new ByteArrayInputStream(byteOut.toByteArray());
        ObjectInputStream in = new ObjectInputStream(byteIn);
        @SuppressWarnings("unchecked")
        List<T> dest = (List<T>) in.readObject();
//        System.out.println(System.currentTimeMillis());
        return dest;
    }
}

运行结果

输入拼音:zuanrenganga
输出:[["zu","an","ren","gan","ga"],["zu","an","ren","gang","a"],["zu","an","reng","an","ga"],["zu","an","reng","ang","a"],["zuan","ren","gan","ga"],["zuan","ren","gang","a"],["zuan","reng","an","ga"],["zuan","reng","ang","a"]]

耗时:60ms
耗时分析:时间主要消耗在递归遍历每一个可能的切分位置，以及每一步对前缀和剩余部分进行的正则匹配上。

如果使用词典去匹配也是可以的,附上词典文件:
pinyin.txt

发表新评论