未登录词识别

未登录词:指不在词典中的词,主要包括以下两类:

---新词:杀马特

---命名实体:奥克兰


主要解决方案:先基于规则合词,然后通过百度验证。规则文件示例如下,每行依次为:词性标记序列、"起始-结束-类型"形式的合并规则、以 # 开头的示例注释。

Start Char Char    1-2-Combine    #[图 n][里 f][市场 n][站 n]
Start Char Char Char    1-3-Combine    #
Start Char Char Char Char    1-4-Combine    #
Start Char Char Char Char Char    1-5-Combine    #
Start Char Char Char Char Char Char    1-6-Combine    #
Start Direction Char    1-2-Combine    #东澳站 南势站
Start Char Word    1-2-Combine    #[台 j][中港 nz][站 n]
Word Char Keyword    0-1-Combine    #[梨园 nz][寮 g][站 v][白沙 nz][屯 ng][站 n]
Char Char Keyword    0-1-Combine    #[商水县 ns][黄 a][寨 ng][站 n]
NumPrefix Num    0-1-Seq    #地五医院
Num NumSuffix    0-1-Seq    #93/号/酒家
Num Num    0-1-Combine #
Num Num Num    0-2-Combine #
Num Num Num Num    0-3-Combine #
Num Num Num Num Num    0-4-Combine #
Num Num Num Num Num Num    0-5-Combine #
Num Num Num Num Num Num Num    0-6-Combine #
Num Num Num Num Num Num Num Num    0-7-Combine #
Num Num Num Num Num Num Num Num Num    0-8-Combine #
Num Num Num Num Num Num Num Num Num Num    0-9-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-10-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-9-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-8-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter    0-7-Combine    #
Letter Letter Letter Letter Letter Letter Letter    0-6-Combine    #
Letter Letter Letter Letter Letter Letter    0-5-Combine    #
Letter Letter Letter Letter Letter    0-4-Combine    #
Letter Letter Letter Letter    0-3-Combine    #
Letter Letter Letter    0-2-Combine    #
Letter Letter    0-1-Combine    #
Num NumSuffix Keyword    0-1-Seq    #海口1号场BLACKSTONE球场
Num Char Char Keyword    0-2-Combine    #八里岔中学
Char Num Char Keyword    0-2-Combine    #八里岔中学
Char Char Num Keyword    0-2-Combine    #八里岔中学
package cn.tianditu.mt.common;



import java.io.BufferedReader;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.util.ArrayList;

import java.util.LinkedList;

import java.util.List;



import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;



/**
 * Stores grammar rules in a ternary search tree (TST) keyed by
 * {@link SegMarkType} sequences and uses them to merge runs of tokens.
 *
 * <p>Each rule is a sequence of {@link SegMarkType}; the tree node reached by
 * the last element of the sequence carries a {@link CombinRule} listing the
 * positions of the matched sequence that are to be merged into one
 * {@link WordInfo}.
 */
public class Grammar {

    protected static Log logger = LogFactory.getLog(Grammar.class);

    /** A node of the ternary search tree. */
    public final class TSTNode {
        /** Rule ending at this node, or {@code null} when no rule ends here. */
        public CombinRule data = null;
        /** Subtree for keys that compare lower than {@link #splitchar}. */
        protected TSTNode loNode;
        /** Subtree for the next position of keys equal to {@link #splitchar}. */
        protected TSTNode eqNode;
        /** Subtree for keys that compare higher than {@link #splitchar}. */
        protected TSTNode hiNode;
        /** The key element this node compares against. */
        protected SegMarkType splitchar;

        public TSTNode(SegMarkType type) {
            this.splitchar = type;
        }
    }

    /** Root of the tree; {@code null} until the first sequence is added. */
    public TSTNode rootNode;

    /**
     * Inserts a sequence into the tree, creating nodes as needed.
     *
     * @param word the sequence to insert; must be non-null and non-empty
     * @return the node that corresponds to the last element of {@code word}
     * @throws NullPointerException if {@code word} is {@code null}
     * @throws IllegalArgumentException if {@code word} is empty
     */
    public TSTNode add(List<SegMarkType> word) {
        if (null == word) {
            throw new NullPointerException("空指针异常");
        }
        // Fail with a clear message instead of an incidental
        // IndexOutOfBoundsException from word.get(0).
        if (word.isEmpty()) {
            throw new IllegalArgumentException(
                    "word must contain at least one SegMarkType");
        }

        int charIndex = 0;
        if (null == rootNode) {
            rootNode = new TSTNode(word.get(0));
        }
        TSTNode currentNode = rootNode;
        while (true) {
            int charComp = word.get(charIndex).compareTo(currentNode.splitchar);
            if (charComp == 0) {
                charIndex++;
                if (charIndex == word.size()) {
                    return currentNode;
                }
                if (null == currentNode.eqNode) {
                    currentNode.eqNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                if (null == currentNode.loNode) {
                    currentNode.loNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.loNode;
            } else {
                if (null == currentNode.hiNode) {
                    currentNode.hiNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Looks up the node that corresponds to the last element of a sequence.
     *
     * @param word the sequence to look up
     * @return the matching node, or {@code null} if {@code word} is
     *         {@code null}, empty, or not present in the tree
     */
    protected TSTNode getNode(List<SegMarkType> word) {
        if (null == word) {
            return null;
        }
        int len = word.size();
        if (len == 0) {
            return null;
        }
        TSTNode currentNode = rootNode; // current position in the tree
        int charIndex = 0; // position in the key of the element being compared
        SegMarkType cmpChar = word.get(charIndex);
        int charComp;
        while (true) {
            if (currentNode == null) { // not found
                return null;
            }
            charComp = cmpChar.compareTo(currentNode.splitchar);
            if (charComp == 0) { // equal: descend
                charIndex++;
                if (charIndex == len) { // found
                    return currentNode;
                } else {
                    cmpChar = word.get(charIndex); // advance in the key
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) { // lower: go left
                currentNode = currentNode.loNode;
            } else { // higher: go right
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Finds the longest rule whose type sequence matches the tokens starting
     * at {@code offset}.
     *
     * @param tokens the token list to match against
     * @param offset index of the first token to match
     * @return the match (rule node plus the index just past the matched
     *         tokens), or {@code null} if nothing matches, the tree is empty,
     *         {@code tokens} is {@code null}, or {@code offset} is out of range
     */
    public MatchRet matchLong(List<WordInfo> tokens, int offset) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        // An out-of-range offset cannot match anything.
        if (offset < 0 || offset >= tokens.size()) {
            return null;
        }

        MatchRet ret = null;
        TSTNode currentNode = rootNode;
        int index = offset;
        while (currentNode != null) {
            int charComp = tokens.get(index).getType().compareTo(
                    currentNode.splitchar);
            if (charComp == 0) {
                index++;
                // Every rule-bearing node on the path overwrites the previous
                // candidate, so the last one recorded is the longest match.
                if (currentNode.data != null) {
                    ret = new MatchRet(currentNode, index);
                }
                if (index == tokens.size()) {
                    return ret;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        return ret;
    }

    /**
     * Merges tokens according to the given rules and returns a new list.
     *
     * <p>Each merged {@link WordInfo} keeps its source tokens (passed as the
     * last constructor argument), so the original sequence is preserved.
     * Tokens not covered by a rule are copied through unchanged.
     *
     * @param tokens the source tokens
     * @param rules  merge spans in absolute token indices (inclusive start
     *               and end); {@code null} means "no rules"
     * @return the merged list, or {@code tokens} itself when {@code rules}
     *         is {@code null}
     */
    private List<WordInfo> combineByRules(List<WordInfo> tokens, List<Combin> rules) {
        if (rules == null) {
            return tokens;
        }
        List<WordInfo> list = new ArrayList<WordInfo>();
        int i = 0;
        while (i < tokens.size()) {
            Combin com = findRuleStartingAt(rules, i, tokens.size());
            if (com == null) {
                list.add(tokens.get(i));
                i++;
                continue;
            }

            int end = com.getEnd();
            List<WordInfo> sub = tokens.subList(i, end + 1); // end index is exclusive
            StringBuilder buff = new StringBuilder();
            for (WordInfo wordInfo : sub) {
                buff.append(wordInfo.getCn());
            }
            list.add(new WordInfo(buff.toString(), null, com.getType(), sub));
            // Resume right after the merged span. The loop condition is
            // re-checked, so a merge ending on the last token terminates
            // cleanly instead of reading past the end of the list.
            i = end + 1;
        }
        return list;
    }

    /**
     * Returns the first rule that starts at {@code start} and describes a
     * valid, non-empty span inside a list of {@code tokenCount} tokens, or
     * {@code null} if there is none. Invalid spans are ignored so they can
     * neither overrun the list nor stall the caller's loop.
     */
    private Combin findRuleStartingAt(List<Combin> rules, int start, int tokenCount) {
        for (Combin com : rules) {
            if (com.getStart() == start
                    && com.getEnd() >= start
                    && com.getEnd() < tokenCount) {
                return com;
            }
        }
        return null;
    }

    /**
     * Merges tokens in place. Supports a single pass only: nested merges are
     * not supported, so this cannot act as a finite-state machine.
     *
     * <p>Rules are applied from last to first so that removing tokens for one
     * rule does not shift the indices of the rules still to be applied. This
     * assumes {@code rules} is ordered by ascending start index and the spans
     * do not overlap.
     *
     * @param tokens the token list, modified in place
     * @param rules  merge spans in absolute token indices (inclusive)
     */
    @SuppressWarnings("unused")
    private void combineOnce(LinkedList<WordInfo> tokens,
            List<Combin> rules) {
        for (int r = rules.size() - 1; r >= 0; r--) {
            Combin com = rules.get(r);
            int start = com.getStart();
            int end = com.getEnd();
            SegMarkType type = com.getType();

            List<WordInfo> span = tokens.subList(start, end + 1);
            StringBuilder buff = new StringBuilder();
            for (WordInfo word : span) {
                buff.append(word.getCn());
            }
            span.clear(); // removes the span from the backing list

            tokens.add(start, new WordInfo(buff.toString(), null, type));
        }
    }

    /**
     * Applies the loaded grammar to a token list: scans left to right, takes
     * the longest rule match at each position, and merges the spans the
     * matched rules describe.
     *
     * @param tokens the tokens to process
     * @return the merged token list, or {@code null} if {@code tokens} is
     *         {@code null} or no grammar has been loaded
     */
    public List<WordInfo> tag(List<WordInfo> tokens) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        List<Combin> rules = new ArrayList<Combin>();
        for (int i = 0; i < tokens.size();) {
            MatchRet ret = matchLong(tokens, i);
            if (null != ret) {
                CombinRule rule = ret.getNode().data; // rule found in the tree
                // Index of the first matched token; rule positions are
                // relative to it.
                int matchStart = ret.getIndex() - rule.getLen();
                for (Combin com : rule.getPosition()) {
                    int start = matchStart + com.getStart();
                    int end = matchStart + com.getEnd();
                    rules.add(new Combin(start, end, com.getType()));
                }
                i = ret.getIndex();
            } else {
                i++;
            }
        }
        return combineByRules(tokens, rules); // merge according to the rules
    }

    /**
     * Builds a grammar from the basic grammar file and the grammar file named
     * by {@code config}, loaded in that order.
     */
    public Grammar(Config config) {
        loadGrammar(config.getBasicGramFileName());
        loadGrammar(config.getGramFileName());
    }

    /**
     * Loads rules from a tab-separated grammar file into the tree.
     *
     * <p>The first field of each line is a space-separated
     * {@link SegMarkType} sequence and the second is the rule specification
     * (see {@link #formRule}); further fields are ignored. Blank and
     * malformed lines are logged and skipped so that one bad line does not
     * discard the rest of the file. I/O failures are logged, not thrown.
     *
     * @param gramFileName path of the grammar file
     */
    public void loadGrammar(String gramFileName) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(gramFileName));
            String line;
            int lineNo = 0;
            while ((line = reader.readLine()) != null) {
                lineNo++;
                if (line.trim().length() == 0) {
                    continue;
                }
                String[] arr = line.split("\t");
                if (arr.length < 2) {
                    logger.warn("Skipping malformed grammar line " + lineNo
                            + " in " + gramFileName + ": " + line);
                    continue;
                }
                try {
                    // Parse both fields before touching the tree so that a
                    // bad rule leaves no partial nodes behind.
                    List<SegMarkType> seq = formSeq(arr[0]);
                    CombinRule rule = formRule(arr[1], seq.size());
                    TSTNode node = this.add(seq);
                    node.data = rule;
                } catch (IllegalArgumentException e) {
                    // Also covers NumberFormatException (a subclass).
                    logger.warn("Skipping invalid grammar line " + lineNo
                            + " in " + gramFileName + ": " + line, e);
                }
            }
        } catch (FileNotFoundException e) {
            logger.error("Grammar file not found: " + gramFileName, e);
        } catch (IOException e) {
            logger.error("Failed to read grammar file: " + gramFileName, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    logger.warn("Failed to close grammar file: " + gramFileName, e);
                }
            }
        }
    }

    /**
     * Parses a rule specification of the form
     * {@code start-end-type[#start-end-type...]}.
     *
     * @param line the rule specification
     * @param size length of the type sequence the rule belongs to
     * @return the parsed rule
     * @throws IllegalArgumentException if a part does not have three
     *         {@code -}-separated fields, a bound is not an integer, or the
     *         type is not a {@link SegMarkType} constant
     */
    private CombinRule formRule(String line, int size) {
        List<Combin> rec = new ArrayList<Combin>();
        for (String str : line.split("#")) {
            String[] parts = str.split("-");
            if (parts.length < 3) {
                throw new IllegalArgumentException(
                        "Malformed combine spec (expected start-end-type): " + str);
            }
            int start = Integer.parseInt(parts[0].trim());
            int end = Integer.parseInt(parts[1].trim());
            SegMarkType type = Enum.valueOf(SegMarkType.class, parts[2].trim());
            rec.add(new Combin(start, end, type));
        }
        return new CombinRule(rec, size);
    }

    /**
     * Parses a space-separated list of {@link SegMarkType} names.
     *
     * @param string the type names; surrounding and repeated spaces are
     *               tolerated
     * @return the parsed sequence
     * @throws IllegalArgumentException if a name is not a
     *         {@link SegMarkType} constant
     */
    private List<SegMarkType> formSeq(String string) {
        List<SegMarkType> list = new ArrayList<SegMarkType>();
        for (String str : string.trim().split(" +")) {
            list.add(Enum.valueOf(SegMarkType.class, str));
        }
        return list;
    }

}

 

你可能感兴趣的:(登录)