Sep 7, 2018 · 14 min read
JAVA抓匯實務示範(3) by htmlunit
規格單: http://www.imf.org/external/index.htm





Coding:
(這邊我的專案是有把新的url一層一層往下一個getData丟,這裡就不多做更動)
/*** 連結抓取資料處理 網頁連線 or 規則複雜的資料截取*/@Overridepublic String getData(String url) throws Exception {HTML htmlx = new HTMLX(url, false);htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁// htmlx.setReadTimeout(parameter.getReadTimeout(),// parameter.getRetry());htmlx.connect();htmlx.extractInputStream(parameter.getEncoding());String source = htmlx.getCodeStringtype();Document doc = Jsoup.parse(source);Document table = Jsoup.parseBodyFragment(doc.getElementById(“content-main”).toString());String a = table.select(“h4 a”).first().attr(“href”);String source1 = a.toString();return source1;}public String getData2(String url) throws Exception {HTML htmlx = new HTMLX(url, false);htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁// htmlx.setReadTimeout(parameter.getReadTimeout(),// parameter.getRetry());htmlx.connect();htmlx.extractInputStream(parameter.getEncoding());String source = htmlx.getCodeStringtype();Document doc = Jsoup.parse(source);Document table = Jsoup.parseBodyFragment(doc.getElementById(“content”).toString());String source1 = table.toString();String source2 = “”;source2 = url.replace(“index.aspx”, getUrl(source1));return source2;}public String getData3(String url) throws Exception {HTML htmlx = new HTMLX(url, false);htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁// htmlx.setReadTimeout(parameter.getReadTimeout(),// parameter.getRetry());htmlx.connect();htmlx.extractInputStream(parameter.getEncoding());String source = htmlx.getCodeStringtype();Document doc = Jsoup.parse(source);Document table = Jsoup.parseBodyFragment(doc.getElementsByClass(“Table740”).toString());String source1 = table.toString();String source2 = “”;source2 = url.replace(“weoselgr.aspx”, getUrl1(source1));return source2;}
上面這些Jsoup段的說明暫略
/*** 模擬瀏覽器*/public void BuildBrowser() {// 關閉日誌輸出(紅色內部運行錯誤)LogFactory.getFactory().setAttribute(“org.apache.commons.logging.Log”,“org.apache.commons.logging.impl.NoOpLog”);webClient = new WebClient(BrowserVersion.FIREFOX_24);// JavaScript元件webClient.getOptions().setJavaScriptEnabled(false);// CSS元件webClient.getOptions().setCssEnabled(false);// AJAX元件webClient.setAjaxController(new NicelyResynchronizingAjaxController());// 網頁TIMEOUTwebClient.getOptions().setTimeout(80000);// 跳轉webClient.getOptions().setRedirectEnabled(true);// IE元件webClient.getOptions().setActiveXNative(true);// 是否拋出頁面javascript錯誤webClient.getOptions().setThrowExceptionOnScriptError(false);// 是否拋出response的錯誤webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);// SSL憑證更新webClient.getOptions().setUseInsecureSSL(true);// 等待JavaScript執行完後的延遲時間webClient.waitForBackgroundJavaScript(10000);// 等待JavaScript執行前的延遲時間webClient.waitForBackgroundJavaScriptStartingBefore(10000);webClient.getCookieManager().setCookiesEnabled(true);webClient.getCurrentWindow().setInnerHeight(Integer.MAX_VALUE);}
開始預備做htmlunit的行為
private WebClient webClient;public String getData4(String url) throws Exception {HtmlPage page = webClient.getPage(url);Page page1 = null;List<DomElement> atags = page.getElementsByTagName(“input”);for (DomElement atag : atags) {if (atag.getAttribute(“id”).equals(“bc”)) {page1 = ((HtmlImageInput) atag).click();}}String source = page1.getUrl().toString();return source;}public String getData5(String url) throws Exception {HtmlPage page = webClient.getPage(url);Page page1 = null;List<DomElement> atags = page.getElementsByTagName(“input”);for (DomElement atag : atags) {// System.out.println(atag);if (atag.getAttribute(“value”).equals(“NGDPDPC”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“NGSD_NGDP”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“LUR”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“LE”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“LP”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}}List<DomElement> atags1 = page.getElementsByTagName(“input”);for (DomElement atag : atags1) {if (atag.getAttribute(“id”).equals(“bc”)) {page1 = ((HtmlImageInput) atag).click();}}String source = page1.getUrl().toString();return source;}public String getData6(String url) throws Exception {HtmlPage page = webClient.getPage(url);HtmlSelect select = page.getElementByName(“sy”);List<HtmlOption> options = select.getOptions();for (HtmlOption Option : options) {if (Option.getAttribute(“value”).equals(“1980”))page = select.setSelectedAttribute(Option, true);}HtmlSelect select2 = page.getElementByName(“ey”);List<HtmlOption> options2 = select2.getOptions();for (HtmlOption Option : options2) {if (Option.getAttribute(“value”).equals(date.substring(0, 4))){page = select2.setSelectedAttribute(Option, true);}}Page page1 = null;List<DomElement> atags = page.getElementsByTagName(“input”);for 
(DomElement atag : atags) {if (atag.getAttribute(“id”).equals(“pr”)) {page1 = ((HtmlImageInput) atag).click();}}String source = page1.getUrl().toString();return source;}public String getData7(String url) throws Exception {HtmlPage page = webClient.getPage(url);List<DomElement> atags = page.getElementsByTagName(“a”);Page downloadPage = null;for (DomElement atag : atags) {if(atag.asText().contains(“Your WEO Report”))downloadPage = ((HtmlAnchor) atag).click();}download(downloadPage,”WEO_Data”);return “”;}/*** 分析資料 存入暫存 網頁資料剖析 , 外部檔欄位*/private String getUrl(String source) {Document doc = Jsoup.parse(source);String url = “”;Elements linkAll = doc.select(“a[href]”);for (Element link : linkAll) {String value = link.attr(“href”);if (link.toString().contains(“By Countries (country-level data)”)) {url = value;}}return url;}private String getUrl1(String source) {Document doc = Jsoup.parse(source);String url = “”;Elements linkAll = doc.select(“a[href]”);for (Element link : linkAll) {String value = link.attr(“href”);if (link.toString().contains(“All countries”)) {url = value;}}return url;}// 下載xls 用 輸入檔名 丟入tempprivate void download(Page page,String filename) throws IOException{InputStream is = page.getWebResponse().getContentAsStream();OutputStream fos = new FileOutputStream(System.getProperty(“user.dir”)+”/data/”+date+filename+”.xls”);byte[] buffer=new byte[1024*30];int len=-1;while((len=is.read(buffer))>0){fos.write(buffer, 0, len);}fos.flush();fos.close();}
20180907只把source code po上來 沒有整理
