import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class T { /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { Document doc = Jsoup.connect("http://www.xxxx.net/new/new_1.htm") .get(); //.data("query", "Java") //.userAgent("Mozilla") //.cookie("auth", "token") //.timeout(3000) //.post(); Elements resultLinks = doc.select("div.main_l_l"); for(Element e:resultLinks){ Elements tresultLinks = e.select("div.list_body a"); for(Element te:tresultLinks){ String href=te.attr("href"); System.out.println("Start:"+href); Document art = Jsoup.connect(href) .get(); String title = art.select("h1").get(0).html(); String content = art.select("#art_content").get(0).html(); Pattern pattern = Pattern.compile("(?si)<!--NEWSZW_HZH_BEGIN-->(.+?)<!--NEWSZW_HZH_END-->"); Matcher m = pattern.matcher(content); while (m.find()) { content=m.group(1); } System.out.println("*************title********************"); System.out.println(title); System.out.println("*************content********************"); System.out.println(content); } } } }