package com.distinct.servlet.url;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;
import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
@SuppressWarnings("serial")
public class DBServlet extends HttpServlet {
// Servlet configuration holder.
// NOTE(review): nothing visible in this class assigns it; init's parameter of the same name shadows it.
private ServletConfig config = null;
// Holder for a JDBC result set. Unlike the other fields it is package-private.
ResultSet selectRes = null;
// JDBC driver class name, read from the DRIVERNAME init parameter.
private String driverName = "";
// Database user name, read from the USERNAME init parameter.
private String username = "";
// Database password, read from the PASSWORD init parameter.
private String password = "";
// JDBC connection URL, read from the DB_URL init parameter.
private String DB_URL = "";
// Name of the database table to work on, read from the MYTABLE init parameter.
private String mysqltable = "";
// Holder for a JDBC connection.
private Connection connect;
// Holder for a JDBC statement.
private Statement stmtement;
/**
 * No-op hook: the body is empty, so calling it has no effect.
 *
 * NOTE(review): the name matches ServletContextListener#contextInitialized, but this class
 * only extends HttpServlet, so presumably nothing invokes it automatically - confirm.
 *
 * @param sce the servlet context event (ignored)
 */
public void contextInitialized(ServletContextEvent sce) {
}
/**
 * Initializes the servlet and loads the database settings from its init parameters.
 *
 * <p>The superclass initialization is invoked first so that the servlet stores its
 * {@link ServletConfig}; the Servlet API requires this whenever this overload is
 * overridden, otherwise {@code getServletConfig()} and the helpers built on it do not work.
 *
 * <p>Parameters read: {@code DRIVERNAME}, {@code DB_URL}, {@code USERNAME},
 * {@code PASSWORD} and {@code MYTABLE}. A parameter that is not configured leaves the
 * corresponding field {@code null}.
 *
 * @param config the servlet configuration supplied by the container
 * @throws IllegalStateException if the superclass initialization fails
 */
public void init(ServletConfig config) {
    try {
        super.init(config);
    } catch (ServletException e) {
        // The signature is kept free of checked exceptions, so surface the failure unchecked.
        throw new IllegalStateException("Servlet initialization failed", e);
    }
    this.driverName = config.getInitParameter("DRIVERNAME");
    this.DB_URL = config.getInitParameter("DB_URL");
    this.username = config.getInitParameter("USERNAME");
    this.password = config.getInitParameter("PASSWORD");
    this.mysqltable = config.getInitParameter("MYTABLE");
}
/**
 * Handles GET requests. The method only obtains the response writer; it writes nothing
 * to the response itself.
 *
 * @param request  the incoming HTTP request (not read here)
 * @param response the HTTP response whose writer is obtained
 * @throws ServletException declared for the servlet contract; not thrown by this body
 * @throws IOException      if the response writer cannot be obtained
 */
protected void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    // Obtain the writer without keeping a reference: no content is produced here.
    response.getWriter();
}
/**
 * Handles POST requests: declares an HTML response, switches the request to UTF-8
 * decoding, and then hands over to {@code doGet}.
 *
 * @param request  the incoming HTTP request
 * @param response the HTTP response being prepared
 * @throws ServletException if {@code doGet} fails
 * @throws IOException      if the encoding cannot be applied or {@code doGet} fails
 */
public void doPost(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    // Response format first, then request decoding, then the shared GET handling.
    response.setContentType("text/html");
    request.setCharacterEncoding("UTF-8");
    this.doGet(request, response);
}
/**
 * Decides whether a crawled news page still has to be fetched and records the decision.
 *
 * <p>Request parameters: {@code url}, {@code title} and {@code time} are required;
 * {@code source}, {@code m_publisher_location} and {@code language} are optional.
 * Exactly one token is written to the response:
 * <ul>
 *   <li>{@code "null"} - a required parameter is missing or the title is empty;</li>
 *   <li>{@code "false"} - the page is already known for that date (same URL, or a
 *       different URL from the same publisher whose title is more than 90% similar);
 *       in the similar-title case the stored row's source list and URL list are
 *       extended with the new source and URL;</li>
 *   <li>{@code "true"} - the page is new or updated and a row is inserted for it.</li>
 * </ul>
 * If the database cannot be reached the failure is printed and nothing is written.
 *
 * <p>The method is {@code synchronized} so that the check-then-insert sequence is not
 * interleaved between concurrent requests handled by this servlet instance.
 *
 * <p>NOTE(review): every request scans the whole table; consider filtering in SQL if
 * the table grows large.
 *
 * @param request  the incoming HTTP request
 * @param response the HTTP response receiving the decision token
 * @throws ServletException if the shared request preparation fails
 * @throws IOException      if the response cannot be written
 */
@Override
protected synchronized void service(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    // doPost sets the response content type and the request encoding; it has to run
    // before the first getParameter call for the UTF-8 setting to take effect.
    doPost(request, response);
    long startTime = System.currentTimeMillis();

    String url = request.getParameter("url");
    System.out.println(url);
    String title = request.getParameter("title");
    System.out.println(title);
    String createTime = request.getParameter("time");
    System.out.println(createTime);
    String source = request.getParameter("source");
    System.out.println(source);
    String publisherLocation = request.getParameter("m_publisher_location");
    System.out.println(publisherLocation);
    String language = request.getParameter("language");
    System.out.println(language);

    if (url != null && createTime != null && title != null && !title.isEmpty()) {
        // Only the leading date part (first 10 characters, e.g. yyyy-MM-dd) is compared and stored.
        String createDate = createTime.length() >= 10 ? createTime.substring(0, 10) : createTime;
        try {
            System.out.println(this.driverName);
            Class.forName(this.driverName);
            System.out.println("Success loading Mysql Driver!");
            try (Connection connection =
                    DriverManager.getConnection(this.DB_URL, this.username, this.password)) {
                // The table name comes from servlet configuration and cannot be bound as a
                // parameter; every request-supplied value below is bound with '?'.
                String selectSql =
                        "SELECT m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,id,language,t_Url FROM "
                                + this.mysqltable;

                boolean duplicate = false;
                boolean merged = false;
                Object matchedId = null;
                String mergedSources = null;
                String mergedUrls = null;

                try (PreparedStatement select = connection.prepareStatement(selectSql);
                        ResultSet rows = select.executeQuery()) {
                    while (rows.next()) {
                        String rowUrl = rows.getString(1);
                        String rowTitle = rows.getString(2);
                        String rowDate = rows.getString(3);

                        if (url.equals(rowUrl)) {
                            // Same URL: a duplicate only when the date matches as well;
                            // otherwise it is an updated page and scanning continues.
                            if (createDate.equals(rowDate)) {
                                duplicate = true;
                                break;
                            }
                        } else if (Objects.equals(publisherLocation, rows.getString(5))
                                && rowTitle != null
                                && levenshtein(title, rowTitle) > 0.9
                                && createDate.equals(rowDate)) {
                            // Different URL, same publisher, near-identical title, same date:
                            // the same article under another address. Remember it on the row.
                            duplicate = true;
                            merged = true;
                            matchedId = rows.getObject(6);
                            mergedSources = mergeDistinct(rows.getString(4), source);
                            mergedUrls = mergeDistinct(url, rows.getString(8));
                            break;
                        }
                    }
                }

                if (duplicate) {
                    response.getWriter().println("false");
                    // Only touch the row when something was merged; an exact duplicate must
                    // not overwrite the stored URL list with an empty value.
                    if (merged) {
                        String updateSql = "UPDATE " + this.mysqltable
                                + " SET m_Source = ?, t_Url = ? WHERE id = ?";
                        try (PreparedStatement update = connection.prepareStatement(updateSql)) {
                            update.setString(1, mergedSources);
                            update.setString(2, mergedUrls);
                            update.setObject(3, matchedId);
                            update.executeUpdate();
                        }
                    }
                } else {
                    // Brand-new page or an updated version of a known one: both are recorded.
                    response.getWriter().println("true");
                    String insertSql = "INSERT INTO " + this.mysqltable
                            + "(language,m_Url,m_Title,m_CreatetimeFormat,m_Source,m_publisher_location,t_Url)"
                            + " VALUES (?,?,?,?,?,?,?)";
                    try (PreparedStatement insert = connection.prepareStatement(insertSql)) {
                        insert.setString(1, language);
                        insert.setString(2, url);
                        insert.setString(3, title);
                        insert.setString(4, createDate);
                        insert.setString(5, source);
                        insert.setString(6, publisherLocation);
                        insert.setString(7, "");
                        insert.executeUpdate();
                    }
                }
            }
        } catch (SQLException | ClassNotFoundException e) {
            // Driver loading or database access failed; the response body stays empty.
            e.printStackTrace();
        }
    } else {
        response.getWriter().println("null");
        System.out.println("Some information was missing. Please see below for details.");
        System.out.println("m_Url,m_Title,m_CreatetimeFormat :Value can't be null");
    }

    long endTime = System.currentTimeMillis();
    System.out.println("程序运行时间:"+(endTime-startTime)+"ms");
}

/**
 * Merges semicolon-separated lists into one list without duplicates, keeping the order
 * in which entries are first seen. {@code null} lists and empty entries are skipped.
 */
private static String mergeDistinct(String... lists) {
    Set<String> unique = new LinkedHashSet<>();
    for (String list : lists) {
        if (list == null) {
            continue;
        }
        for (String entry : list.split(";")) {
            if (!entry.isEmpty()) {
                unique.add(entry);
            }
        }
    }
    StringBuilder joined = new StringBuilder();
    for (String entry : unique) {
        if (joined.length() > 0) {
            joined.append(';');
        }
        joined.append(entry);
    }
    return joined.toString();
}
/**
 * Computes a similarity score between two strings from their Levenshtein edit distance:
 * {@code 1 - distance / max(length1, length2)}.
 *
 * <p>{@code null} is treated as the empty string. Two empty strings are identical and
 * score {@code 1.0} (the plain formula would divide zero by zero and yield NaN).
 *
 * @param str1 the first string, may be {@code null}
 * @param str2 the second string, may be {@code null}
 * @return a value in {@code [0, 1]}; {@code 1.0} means identical, {@code 0.0} means
 *         nothing in common
 */
public static float levenshtein(String str1, String str2) {
    String first = str1 == null ? "" : str1;
    String second = str2 == null ? "" : str2;
    int len1 = first.length();
    int len2 = second.length();
    int longest = Math.max(len1, len2);
    if (longest == 0) {
        return 1.0f;
    }

    // Only the previous and the current row of the distance matrix are ever read,
    // so two rows are enough.
    int[] previous = new int[len2 + 1];
    int[] current = new int[len2 + 1];
    for (int j = 0; j <= len2; j++) {
        previous[j] = j;
    }
    for (int i = 1; i <= len1; i++) {
        current[0] = i;
        for (int j = 1; j <= len2; j++) {
            int cost = first.charAt(i - 1) == second.charAt(j - 1) ? 0 : 1;
            // Cheapest of: substitute/match, insert, delete.
            current[j] = Math.min(previous[j - 1] + cost,
                    Math.min(current[j - 1] + 1, previous[j] + 1));
        }
        int[] swap = previous;
        previous = current;
        current = swap;
    }
    // After the final swap 'previous' holds the last computed row.
    return 1 - (float) previous[len2] / longest;
}
/**
 * Returns the smallest of the given values, or {@code Integer.MAX_VALUE} when no
 * values are passed.
 *
 * @param is the candidate values
 * @return the minimum of {@code is}
 */
private static int min(int... is) {
    int smallest = Integer.MAX_VALUE;
    for (int idx = 0; idx < is.length; idx++) {
        smallest = Math.min(smallest, is[idx]);
    }
    return smallest;
}
}