Msx Wu
Msx Wu
Sep 7, 2018 · 14 min read

JAVA抓匯實務示範(3) by htmlunit

規格單: http://www.imf.org/external/index.htm

Coding:

(這邊我的專案是有把新的url一層一層往下一個getData丟,這裡就不多做更動)

/*** 連結抓取資料處理 網頁連線 or 規則複雜的資料截取*/@Overridepublic String getData(String url) throws Exception {HTML htmlx = new HTMLX(url, false);htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁// htmlx.setReadTimeout(parameter.getReadTimeout(),// parameter.getRetry());htmlx.connect();htmlx.extractInputStream(parameter.getEncoding());String source = htmlx.getCodeStringtype();Document doc = Jsoup.parse(source);Document table = Jsoup.parseBodyFragment(doc.getElementById(“content-main”).toString());String a = table.select(“h4 a”).first().attr(“href”);String source1 = a.toString();return source1;}public String getData2(String url) throws Exception {HTML htmlx = new HTMLX(url, false);htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁// htmlx.setReadTimeout(parameter.getReadTimeout(),// parameter.getRetry());htmlx.connect();htmlx.extractInputStream(parameter.getEncoding());String source = htmlx.getCodeStringtype();Document doc = Jsoup.parse(source);Document table = Jsoup.parseBodyFragment(doc.getElementById(“content”).toString());String source1 = table.toString();String source2 = “”;source2 = url.replace(“index.aspx”, getUrl(source1));return source2;}public String getData3(String url) throws Exception {HTML htmlx = new HTMLX(url, false);htmlx.setConnectTimeout(parameter.getConnectTimeout(), parameter.getRetry());// 設定網頁連結多久中斷 ,為避免連線過久卡在網頁// htmlx.setReadTimeout(parameter.getReadTimeout(),// parameter.getRetry());htmlx.connect();htmlx.extractInputStream(parameter.getEncoding());String source = htmlx.getCodeStringtype();Document doc = Jsoup.parse(source);Document table = Jsoup.parseBodyFragment(doc.getElementsByClass(“Table740”).toString());String source1 = table.toString();String source2 = “”;source2 = url.replace(“weoselgr.aspx”, getUrl1(source1));return source2;}

上面這些Jsoup段的說明暫略

/*** 模擬瀏覽器*/public void BuildBrowser() {// 關閉日誌輸出(紅色內部運行錯誤)LogFactory.getFactory().setAttribute(“org.apache.commons.logging.Log”,“org.apache.commons.logging.impl.NoOpLog”);webClient = new WebClient(BrowserVersion.FIREFOX_24);// JavaScript元件webClient.getOptions().setJavaScriptEnabled(false);// CSS元件webClient.getOptions().setCssEnabled(false);// AJAX元件webClient.setAjaxController(new NicelyResynchronizingAjaxController());// 網頁TIMEOUTwebClient.getOptions().setTimeout(80000);// 跳轉webClient.getOptions().setRedirectEnabled(true);// IE元件webClient.getOptions().setActiveXNative(true);// 是否拋出頁面javascript錯誤webClient.getOptions().setThrowExceptionOnScriptError(false);// 是否拋出response的錯誤webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);// SSL憑證更新webClient.getOptions().setUseInsecureSSL(true);// 等待JavaScript執行完後的延遲時間webClient.waitForBackgroundJavaScript(10000);// 等待JavaScript執行前的延遲時間webClient.waitForBackgroundJavaScriptStartingBefore(10000);webClient.getCookieManager().setCookiesEnabled(true);webClient.getCurrentWindow().setInnerHeight(Integer.MAX_VALUE);}

開始預備做htmlunit的行為

private WebClient webClient;public String getData4(String url) throws Exception {HtmlPage page = webClient.getPage(url);Page page1 = null;List<DomElement> atags = page.getElementsByTagName(“input”);for (DomElement atag : atags) {if (atag.getAttribute(“id”).equals(“bc”)) {page1 = ((HtmlImageInput) atag).click();}}String source = page1.getUrl().toString();return source;}public String getData5(String url) throws Exception {HtmlPage page = webClient.getPage(url);Page page1 = null;List<DomElement> atags = page.getElementsByTagName(“input”);for (DomElement atag : atags) {// System.out.println(atag);if (atag.getAttribute(“value”).equals(“NGDPDPC”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“NGSD_NGDP”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“LUR”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“LE”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}if (atag.getAttribute(“value”).equals(“LP”)) {page1 = ((HtmlCheckBoxInput) atag).setChecked(true);}}List<DomElement> atags1 = page.getElementsByTagName(“input”);for (DomElement atag : atags1) {if (atag.getAttribute(“id”).equals(“bc”)) {page1 = ((HtmlImageInput) atag).click();}}String source = page1.getUrl().toString();return source;}public String getData6(String url) throws Exception {HtmlPage page = webClient.getPage(url);HtmlSelect select = page.getElementByName(“sy”);List<HtmlOption> options = select.getOptions();for (HtmlOption Option : options) {if (Option.getAttribute(“value”).equals(“1980”))page = select.setSelectedAttribute(Option, true);}HtmlSelect select2 = page.getElementByName(“ey”);List<HtmlOption> options2 = select2.getOptions();for (HtmlOption Option : options2) {if (Option.getAttribute(“value”).equals(date.substring(0, 4))){page = select2.setSelectedAttribute(Option, true);}}Page page1 = null;List<DomElement> atags = page.getElementsByTagName(“input”);for 
(DomElement atag : atags) {if (atag.getAttribute(“id”).equals(“pr”)) {page1 = ((HtmlImageInput) atag).click();}}String source = page1.getUrl().toString();return source;}public String getData7(String url) throws Exception {HtmlPage page = webClient.getPage(url);List<DomElement> atags = page.getElementsByTagName(“a”);Page downloadPage = null;for (DomElement atag : atags) {if(atag.asText().contains(“Your WEO Report”))downloadPage = ((HtmlAnchor) atag).click();}download(downloadPage,”WEO_Data”);return “”;}/*** 分析資料 存入暫存 網頁資料剖析 , 外部檔欄位*/private String getUrl(String source) {Document doc = Jsoup.parse(source);String url = “”;Elements linkAll = doc.select(“a[href]”);for (Element link : linkAll) {String value = link.attr(“href”);if (link.toString().contains(“By Countries (country-level data)”)) {url = value;}}return url;}private String getUrl1(String source) {Document doc = Jsoup.parse(source);String url = “”;Elements linkAll = doc.select(“a[href]”);for (Element link : linkAll) {String value = link.attr(“href”);if (link.toString().contains(“All countries”)) {url = value;}}return url;}// 下載xls 用 輸入檔名 丟入tempprivate void download(Page page,String filename) throws IOException{InputStream is = page.getWebResponse().getContentAsStream();OutputStream fos = new FileOutputStream(System.getProperty(“user.dir”)+”/data/”+date+filename+”.xls”);byte[] buffer=new byte[1024*30];int len=-1;while((len=is.read(buffer))>0){fos.write(buffer, 0, len);}fos.flush();fos.close();}

20180907只把source code po上來 沒有整理

一個私大資管學生的日誌本

學店跟頂大差在哪? 差在學店生的自卑

Msx Wu

Written by

Msx Wu

一個私大資管學生的日誌本

學店跟頂大差在哪? 差在學店生的自卑

Welcome to a place where words matter. On Medium, smart voices and original ideas take center stage - with no ads in sight. Watch
Follow all the topics you care about, and we’ll deliver the best stories for you to your homepage and inbox. Explore
Get unlimited access to the best stories on Medium — and support writers while you’re at it. Just $5/month. Upgrade