这两天打算把魔兽世界的官方漫画《王者归来》看看，体会一下刀疤男的复仇历程。Google 后发现 uuu9 上有中文版，但是只能在线看，每次都要点击图片最大化才能看，很烦，于是想下载下来看。用 Firebug 查看，发现页面写得还算规范。做法就是用 jsoup 解析出图片地址，然后用 URL 逐张下载下来。很简单，也不想用多线程了，就这样直接一张一张来吧，毕竟我家里 4M 的网速不是盖的。
package pic; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class Spider { /** * @param args */ public static void main(String[] args) { File dir = new File("F:/国王归来"); if (!dir.exists()) { dir.mkdirs(); } Spider spider = new Spider(); spider.catalog(dir, "http://wow.uuu9.com/2008/200812/187521.shtml"); spider.prey(dir); } /** * 分析目录 * * @param address */ private void catalog(File dir, String address) { try { URL url = new URL(address); Document doc = Jsoup.parse(url, 1000 * 3); Element body = doc.body(); Element textworld = body.getElementsByClass("textworld").first(); Element table = textworld.getElementsByTag("table").first(); Elements hrefs = table.getElementsByTag("a"); Map<File, String> map = new LinkedHashMap<File, String>(); File catalog = new File(dir, "catalog.txt"); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(catalog), "UTF-8")); for (Element a : hrefs) { if (a.children().size() == 0) { continue; } Element strong = a.child(0); if (strong != null && strong.tagName().equalsIgnoreCase("strong")) { String title = strong.text() .replaceAll("[\\.\\-\\::]", "_") .replaceAll("\\s", ""); File f = new File(dir, title); if (!f.exists()) { f.mkdirs(); } String href = a.attr("href"); bw.write(title + "(" + href + ")\r\n"); map.put(f, href); } } bw.close(); for 
(Map.Entry<File, String> entry : map.entrySet()) { File f = entry.getKey(); Set<String> set = new LinkedHashSet<String>(); this.section(set, entry.getValue()); OutputStreamWriter osw = new OutputStreamWriter( new FileOutputStream(new File(f, "catalog.txt")), "UTF-8"); int i = 1; for (String src : set) { osw.write(i++ + "(" + src + ")\r\n"); } osw.close(); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } /** * 分析章节 * * @param address * 章节地址 */ private void section(Set<String> set, String address) { try { URL url = new URL(address); Document doc = Jsoup.parse(url, 1000 * 10); Element body = doc.body(); Element div = body.getElementsByClass("textworld").first(); Element img = div.getElementsByTag("img").first(); String src = img.attr("src"); System.out.println(src); set.add(src); Element none = div.getElementById("pagecount"); Element links = none.previousElementSibling(); Element font = links.getElementsByTag("font").first(); Element next = font.nextElementSibling(); if (next.text().matches("\\[\\d+\\]")) { this.section(set, next.absUrl("href")); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } /** * 下载图片 * * @param dir */ private void prey(File dir) { for (File f : dir.listFiles()) { if (!f.isDirectory()) { continue; } File catalog = new File(f, "catalog.txt"); if (!catalog.exists()) { continue; } try { BufferedReader reader = new BufferedReader( new InputStreamReader(new FileInputStream(catalog), "UTF-8")); String line = null; while ((line = reader.readLine()) != null) { String path = line.substring(line.indexOf("(") + 1, line .length() - 1); try { URL url = new URL(path); HttpURLConnection con = (HttpURLConnection) url .openConnection(); InputStream is = con.getInputStream(); OutputStream os = new FileOutputStream(new File(f, path .substring(path.lastIndexOf("/") + 1))); byte[] b = new byte[1024 * 
4]; int l = -1; while ((l = is.read(b)) != -1) { os.write(b, 0, l); } os.flush(); os.close(); con.disconnect(); System.out.println(path + " download to " + dir.getCanonicalPath() + " complete."); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } reader.close(); if (catalog.renameTo(new File(f, "catalog"))) { catalog.delete(); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } }