网页去重

package com.distinct.servlet.url;



import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Objects;
import java.util.Set;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;




@SuppressWarnings("serial")

public class DBServlet extends HttpServlet {

// 定义一个ServletConfig对象

private ServletConfig config = null;

// 初始化结果集

ResultSet selectRes = null;

// 定义私有字符串常量并初始化

private String driverName = "";

// 定义的数据库用户名

private String  username = "";

// 定义的数据库连接密码

private String password = "";

//数据库的连接路径

    private String DB_URL = "";

//连接的数据库的表

    private String mysqltable = "";

// 初始化连接

private Connection connect;

// 初始化数据库操作

private Statement stmtement;

 

public void contextInitialized(ServletContextEvent sce) {

}


/**

 * 

 * @param path 配置文件的路径

 * @param sc ServletContext对象

 */

public void init(ServletConfig config){

this.driverName = config.getInitParameter("DRIVERNAME"); 

this.DB_URL = config.getInitParameter("DB_URL");

this.username = config.getInitParameter("USERNAME");

this.password = config.getInitParameter("PASSWORD");

this.mysqltable = config.getInitParameter("MYTABLE");

 

    

}


// 处理 GET 方法请求的方法

protected void doGet(HttpServletRequest request, HttpServletResponse response)

throws ServletException, IOException {

// 设置响应内容类型

PrintWriter out = response.getWriter();

/* String title = "Using GET Method to Read Form Data"; String docType =

* "<!doctype html public \"-//w3c//dtd html 4.0 " +

* "transitional//en\">\n"; out.println(docType + "<html>\n" +

* "<head><title>" + title + "</title></head>\n" +

* "<body bgcolor=\"#f0f0f0\">\n" + "<h1 align=\"center\">" + title +

* "</h1>\n" + "<ul>\n" + "  <li><b>url</b>:" +

* request.getParameter("url") + "\n" + "  <li><b>title</b>:" +

* request.getParameter("title") + "\n" + "  <li><b>time</b>:" +

* request.getParameter("time") + "\n" + "  <li><b>source</b>:" +

* request.getParameter("source") + "\n" + "</ul>\n" +

* "</body></html>");

*/

}


// 处理 POST 方法请求的方法

public void doPost(HttpServletRequest request, HttpServletResponse response ) throws ServletException, IOException {

//页面的显示格式

response.setContentType("text/html");

request.setCharacterEncoding("UTF-8");


doGet(request, response);


}


@Override

protected synchronized void service(HttpServletRequest request, HttpServletResponse response)

throws ServletException, IOException{


doPost(request, response);

//获取当前时间

long startTime = System.currentTimeMillis();

// 客户端输入的参数;

String m_Url ;

m_Url = request.getParameter("url");

System.out.println(m_Url);

// 抓取新闻的标题

String m_Title ;

   m_Title = request.getParameter("title");

System.out.println(m_Title);

// 新闻的发布时间

String Createtime ;

Createtime = request.getParameter("time");

System.out.println(Createtime);

String m_CreatetimeFormat = "";

// 新闻的抓取来源

String m_Source = request.getParameter("source");

System.out.println(m_Source);

// 新闻的发布单位

String m_publisher_location = request.getParameter("m_publisher_location");

System.out.println(m_publisher_location);

// 新闻的语言类

String language = request.getParameter("language");

System.out.println(language);

// 默认编码

// System.out.println(java.nio.charset.Charset.defaultCharset());

//记录抓取的全部路径

String t_Url = "";

//数据查询语句

String selectSql = "SELECT m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,id,language,t_Url FROM "

+ this.mysqltable ;

// 去重逻辑

if (m_Url != null && m_CreatetimeFormat != null && m_Title != null && m_Title != "") {

//截取时间的年月日的格式

if(Createtime.length()>=10 ){

m_CreatetimeFormat = Createtime.substring(0, 10);

}else{

m_CreatetimeFormat = Createtime;

}

String value4 = null;

String value6 = null;

try {

// 注册 JDBC 驱动器

System.out.println(this.driverName);

Class.forName(this.driverName);

// 加载MYSQL JDBC驱动程序

System.out.println("Success loading Mysql Driver!");

//打开一个连接

connect = DriverManager.getConnection(this.DB_URL, this.username, this.password);


stmtement = connect.createStatement();

// 查询数据并输出

selectRes = stmtement.executeQuery(selectSql);

int i = 0;


while (selectRes.next()) {

// 循环输出结果集

String value1 = selectRes.getString(1);

// response.getWriter().println(value1 + "<BR>");

String value2 = selectRes.getString(2);

// response.getWriter().println(value2+ "<BR>");

String value3 = selectRes.getString(3);

// response.getWriter().println(value3 + "<BR>");

value4 = selectRes.getString(4);

String value5 = selectRes.getString(5);

value6 = selectRes.getString(6);

String value7 = selectRes.getString(7);

String value8 = selectRes.getString(8);


float similarity = levenshtein(m_Title, value2);

// System.out.println(similarity);


// 抓取新聞的去重逻辑

// *m_Source可能存在重复值

if (!m_Url.equals(value1) && m_publisher_location.equals(value5)) {

// 计算标题相似度

if (similarity > 0.9) {

// m_Title==value2

// 存在不同url的相同网页,请不用抓取

if (m_CreatetimeFormat.equals(value3)) {

t_Url = m_Url;

t_Url = t_Url + ";" + value8;

//m_Source和 t_Url去重

value4 = value4 + ";" + m_Source;

//去掉value里面相同的值

String[] aa =value4.split(";");

Set set=new HashSet();

for(int il=0;il<aa.length;il++){

set.add(aa[il]);

}

value4 = "";

  for( Iterator   it = set.iterator(); it.hasNext(); )

  {      String aaa = it.next().toString();       

   value4 = aaa + ";" + value4;           

   }

i = -1;

break;

} else {// 存在不同url的更新网页,请抓取

i = i + 1;

}

} else {

continue;

}


}


else if (!m_Url.equals(value1) && !m_publisher_location.equals(value5)) {

continue;

}


else { // m_Url == value1 && m_publisher_location == value5)

if (m_CreatetimeFormat.equals(value3)) {

i = -1;

break;

} else {

// 存在相同网页的更新数据,请抓取;

i = i + 1;

continue;

}

}

}


if (i == -1) {


// response.getWriter().println("网页数据已存在,这条数据不要抓取" +"<BR>");

response.getWriter().println("false");

String updateSql = "UPDATE " + this.mysqltable + " SET m_Source = '" + value4 + "', " + " t_Url ='" + t_Url + "' WHERE id ="

+ value6;

stmtement.executeUpdate(updateSql); 

} else if (i == 0) {


// response.getWriter().println("这条新数据需要抓取" + "<BR>");

response.getWriter().println("true");

String insertSql = "INSERT INTO " + this.mysqltable

+ "(language,m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,t_Url) VALUES ( '"

+ language + "','" + m_Url + "', '" + m_Title + "','" + m_CreatetimeFormat + "','"

+ m_Source + "','" + m_publisher_location + "','" + t_Url + "')";

stmtement.executeUpdate(insertSql);


} else {


// response.getWriter().println("网页数据已更新,这条数据需要抓取" +"<BR>");

response.getWriter().println("true");

String insertSql = "INSERT INTO " + this.mysqltable

+ "(language,m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,t_Url) VALUES ( '"

+ language + "','" + m_Url + "', '" + m_Title + "','" + m_CreatetimeFormat + "','"

+ m_Source + "','" + m_publisher_location + "','" + t_Url + "')";

stmtement.executeUpdate(insertSql);

}

// 清理环境

selectRes.close();

stmtement.close();

connect.close();

} catch (SQLException | ClassNotFoundException e) {

// 处理 Class.forName 错误

e.printStackTrace();

//logger.error(e);

} finally {

// 最后是用于关闭资源的块

try {

if (stmtement != null)

stmtement.close();

} catch (SQLException se2) {

// logger.error("关闭数据库出现异常",se2);

} // 我们不能做什么

try {

if (connect != null)

connect.close();

} catch (SQLException se) {

se.printStackTrace();

//logger.error("连接数据库出现异常",se);

} // end finally try

} // end try

} else {

// response.getWriter().println("输入数据格式不正确,请重新输入!" + "<BR>");

response.getWriter().println("null");

System.out.println("Some information was missing. Please see below for details.");

System.out.println("m_Url,m_Title,m_CreatetimeFormat :Value can't be null");

}

//测试程序运行时间

long endTime = System.currentTimeMillis();

System.out.println("程序运行时间:"+(endTime-startTime)+"ms");

}

//正向,反向字符串比较

public static float levenshtein(String str1, String str2) {


int len1 = str1.length();

int len2 = str2.length();


int[][] dif = new int[len1 + 1][len2 + 1];


for (int a = 0; a <= len1; a++) {

dif[a][0] = a;

}

for (int a = 0; a <= len2; a++) {

dif[0][a] = a;

}


int temp;

for (int i = 1; i <= len1; i++) {

for (int j = 1; j <= len2; j++) {

if (str1.charAt(i - 1) == str2.charAt(j - 1)) {

temp = 0;

} else {

temp = 1;

}

// 取三个值中最小的

dif[i][j] = min(dif[i - 1][j - 1] + temp, dif[i][j - 1] + 1, dif[i - 1][j] + 1);

}

}

// System.out.println("字符串\"" + str1 + "\"与\"" + str2 + "\"的比较");

// System.out.println("差异步骤:" + dif[len1][len2]);

// 计算相似度

float similarity = 1 - (float) dif[len1][len2] / Math.max(str1.length(), str2.length());

// System.out.println("相似度:" + similarity);

return similarity;

}


private static int min(int... is) {

int min = Integer.MAX_VALUE;

for (int i : is) {

if (min > i) {

min = i;

}

}

return min;

}

}


你可能感兴趣的:(网页去重)