双维度文本查重算法

思路:

(图1:双维度文本查重算法思路示意图,原图缺失)

调用 detect(src, tar) 方法即可得到重复率(返回值类型为 float)。

代码如下:

package com.hnisc.cmpas.util;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Two-dimensional text duplicate checker.
 *
 * <p>A text is cut into sentences, every sentence is normalised, and each
 * sentence of the checked text is compared with every sentence of the
 * reference text along two dimensions:
 * <ol>
 *   <li>the longest common contiguous substring, and</li>
 *   <li>the number of characters the two sentences share.</li>
 * </ol>
 * The higher of the two scores is the similarity of that sentence.
 *
 * <p>NOTE(review): the published listing of this class was damaged by HTML
 * tag stripping (everything between a {@code <} and the following {@code >}
 * was removed). Surviving names, signatures and statements are kept; the
 * parts flagged with NOTE(review) below were rebuilt and must be confirmed
 * against the original project.
 */
public class DuplicateDetection {
    /*
     * Regex separating paragraphs. NOTE(review): the original literal was lost
     * (it was rendered as a bare line break), so both a br tag in any common
     * spelling and a raw line break are accepted.
     */
    private static final String html_seperator = "(?i)<br\\s*/?>|\\r?\\n";

    /*
     * Regex separating sentences: comma, full stop, semicolon, exclamation mark
     * and colon. NOTE(review): the scraped literal listed the ASCII marks twice
     * next to an ideographic full stop, which suggests the second group was
     * originally full-width; both ASCII and full-width forms are accepted here.
     */
    private static final String sentence_seperator = "[,.;!:\uFF0C\u3002\uFF1B\uFF01\uFF1A]";

    /* Regex matching the noise removed by clean(): whitespace and the nbsp entity. */
    private static final String noise = "[\\s\\u00A0\\u3000]+|&nbsp;";

    /**
     * Cuts a text into sentences: first into paragraphs, then every paragraph
     * at sentence punctuation.
     *
     * @param src text to cut; {@code null} is treated as empty
     * @return the sentences in order of appearance (may contain empty strings)
     */
    public static String[] split(String src) {
        if (src == null) {
            return new String[0];
        }
        List<String> result = new ArrayList<>();
        for (String paragraph : src.split(html_seperator)) {
            for (String sentence : paragraph.split(sentence_seperator)) {
                result.add(sentence);
            }
        }
        return result.toArray(new String[0]);
    }

    /**
     * Normalises every sentence in place so that formatting differences do not
     * hide duplicated content.
     *
     * <p>NOTE(review): only the index loop header of the original survived; the
     * normalisation chosen here (drop all whitespace and the nbsp entity) is a
     * reconstruction.
     *
     * @param src sentences to normalise; elements are overwritten
     * @return the same array instance, for chaining
     */
    public static String[] clean(String[] src) {
        for (int i = 0; i < src.length; i++) {
            if (src[i] != null) {
                src[i] = src[i].replaceAll(noise, "");
            }
        }
        return src;
    }

    /**
     * Computes how much of {@code src} is duplicated in {@code tar}.
     *
     * <p>NOTE(review): the original body was lost; only the call
     * {@code detect(src, tar)} survived. Here every non-empty sentence of
     * {@code src} contributes its best similarity weighted by its length.
     *
     * @param src text being checked
     * @param tar text it is checked against
     * @return duplicate rate between 0 and 1; 0 when {@code src} has no content
     */
    public static float detect(String src, String tar) {
        String[] srcLines = clean(split(src));
        String[] tarLines = clean(split(tar));
        int total = 0;
        double matched = 0;
        for (String line : srcLines) {
            if (line.isEmpty()) {
                continue;
            }
            total += line.length();
            matched += checkSingleLine(line, tarLines) * line.length();
        }
        // Guard against 0/0 when the checked text contains no characters.
        return total == 0 ? 0f : (float) (matched / total);
    }

    /**
     * Returns the best similarity of one sentence against a set of sentences,
     * taking the higher of the two measures for every candidate.
     * NOTE(review): the method name and signature were lost and are rebuilt.
     */
    private static float checkSingleLine(String line, String[] tars) {
        float result = 0;
        for (String s : tars) {
            if (s == null || s.isEmpty()) {
                continue;
            }
            // Similarity based on the longest common substring.
            float temp1 = checkSingleLineWithSingleLine(line, s);
            if (temp1 > result) {
                result = temp1;
            }
            // Similarity based on the number of shared characters.
            float temp2 = checkDuplicationWithMatrix(line, s);
            if (temp2 > result) {
                result = temp2;
            }
            if (result >= 1f) {
                break; // cannot get better than a full match
            }
        }
        return result;
    }

    /**
     * Compares two sentences by their longest common contiguous substring.
     *
     * @return length of that substring divided by the length of {@code line};
     *         0 when either argument is empty
     */
    private static float checkSingleLineWithSingleLine(String line, String src) {
        if (line.isEmpty() || src.isEmpty()) {
            return 0f; // avoids dividing by zero below
        }
        // Rolling-row table: cur[j] is the length of the common suffix of
        // line[0..i) and src[0..j).
        int[] prev = new int[src.length() + 1];
        int[] cur = new int[src.length() + 1];
        int best = 0;
        for (int i = 1; i <= line.length(); i++) {
            for (int j = 1; j <= src.length(); j++) {
                if (line.charAt(i - 1) == src.charAt(j - 1)) {
                    cur[j] = prev[j - 1] + 1;
                    if (cur[j] > best) {
                        best = cur[j];
                    }
                } else {
                    cur[j] = 0;
                }
            }
            int[] swap = prev;
            prev = cur;
            cur = swap;
        }
        float result = best;
        result /= line.length();
        return result;
    }

    /**
     * Compares two sentences by the characters they share, ignoring order.
     *
     * <p>NOTE(review): the original body was lost. Here every character of
     * {@code s2} can be matched at most once, so repeated characters in
     * {@code s1} do not inflate the score.
     *
     * @return number of matched characters divided by the length of {@code s1};
     *         0 when either argument is empty
     */
    private static float checkDuplicationWithMatrix(String s1, String s2) {
        if (s1.isEmpty() || s2.isEmpty()) {
            return 0f;
        }
        boolean[] used = new boolean[s2.length()];
        int count = 0;
        for (int i = 0; i < s1.length(); i++) {
            char c = s1.charAt(i);
            for (int j = 0; j < s2.length(); j++) {
                if (!used[j] && s2.charAt(j) == c) {
                    used[j] = true;
                    count++;
                    break;
                }
            }
        }
        float result = count;
        result /= s1.length();
        return result;
    }

    /**
     * Formats a rate as a percentage with two decimals, e.g. 0.5 becomes "50.00%".
     * NOTE(review): the original body was lost; the exact format is rebuilt.
     *
     * @param value rate, where 1 means 100 percent
     * @return the percentage text
     */
    public static String transferFloatToPersentString(float value) {
        return String.format(java.util.Locale.ROOT, "%.2f%%", value * 100);
    }

    /** Demo: checks one multiplication-table program against another and prints the rate and elapsed time. */
    public static void main(String[] args) {
        Date start = new Date();
        // NOTE(review): all but the tail of the original reference sample was
        // lost; this is a stand-in with the same ending.
        String tar = "public class BBBB {\n"
                + "public static void main(String[] args) {\n"
                + "for (int i = 1; i <= 9; i++) {\n"
                + "for (int j = 1; j <= i; j++) {\n"
                + "System.out.print(j+\"*\"+i+\"=\"+j*i+\"\\t\");\n"
                + "}\n"
                + "System.out.println();\n"
                + "}\n"
                + "}\n"
                + "}";
        String src = "public class AAAA {\n"
                + "// 打印九九乘法表\n"
                + "public static void nineSortTest(){\n"
                + "for(int index1 = 1; index1 <= 9; index1++) {\n"
                + "for (int index2 = 1; index2 <= index1; index2++) {\n"
                + "System.out.print(index2+\"*\"+index1+\"=\"+index2*index1+\"\\t\");\n"
                + "}\n"
                + "System.out.println();\n"
                + "}\n"
                + "}\n"
                + "public static void main(String[] args) {\n"
                + "\n"
                + "nineSortTest();\n"
                + "\n"
                + "}\n"
                + "}";
        System.out.println(transferFloatToPersentString(detect(src, tar)));
        Date end = new Date();
        System.out.println("花费时间:" + (end.getTime() - start.getTime()) + "毫秒");
    }
}

运行结果:

你可能感兴趣的:(算法)