From 9b46b7cf0f6fa3e06d80f323f63f5087086bed40 Mon Sep 17 00:00:00 2001 From: Vivian Delannoy Date: Thu, 8 Apr 2021 17:15:10 +0200 Subject: [PATCH 1/3] little fix smells deprecated decorator in Spider.java --- webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java | 1 + 1 file changed, 1 insertion(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 5940e738d..ce6383751 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -173,6 +173,7 @@ public Spider setUUID(String uuid) { * @param scheduler scheduler * @return this * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) + * @deprecated */ @Deprecated public Spider scheduler(Scheduler scheduler) { From fcffe42bd1ff487576c707c375128c8d822cb27b Mon Sep 17 00:00:00 2001 From: Vivian Delannoy Date: Sun, 11 Apr 2021 23:14:43 +0200 Subject: [PATCH 2/3] adding a try-catch-finally to properly close configFileReader --- .../downloader/selenium/WebDriverPool.java | 80 ++++++++++--------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index e1d9dd039..d53630fe7 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -63,46 +63,54 @@ class WebDriverPool { * @throws IOException */ public void configure() throws IOException { - // Read config file - sConfig = new Properties(); - String configFile = DEFAULT_CONFIG_FILE; - if (System.getProperty("selenuim_config")!=null){ - configFile = System.getProperty("selenuim_config"); - } - sConfig.load(new FileReader(configFile)); - - // Prepare capabilities - sCaps = new DesiredCapabilities(); - sCaps.setJavascriptEnabled(true); - sCaps.setCapability("takesScreenshot", false); - - String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); - - // Fetch PhantomJS-specific configuration parameters - if (driver.equals(DRIVER_PHANTOMJS)) { - // "phantomjs_exec_path" - if (sConfig.getProperty("phantomjs_exec_path") != null) { - sCaps.setCapability( - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, - sConfig.getProperty("phantomjs_exec_path")); - } else { - throw new IOException( - String.format( - "Property '%s' not set!", - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); + try{ + // Read config file + sConfig = new Properties(); + String configFile = DEFAULT_CONFIG_FILE; + if (System.getProperty("selenuim_config")!=null){ + configFile = System.getProperty("selenuim_config"); } - // "phantomjs_driver_path" - if (sConfig.getProperty("phantomjs_driver_path") != null) { - System.out.println("Test will use an external GhostDriver"); - sCaps.setCapability( - PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, - sConfig.getProperty("phantomjs_driver_path")); - } else { - System.out - .println("Test will use PhantomJS internal GhostDriver"); + FileReader configFileReader = new FileReader(configFile) + sConfig.load(configFileReader); + + // Prepare capabilities + sCaps = new DesiredCapabilities(); + sCaps.setJavascriptEnabled(true); + sCaps.setCapability("takesScreenshot", false); + String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); + // Fetch PhantomJS-specific configuration parameters + if (driver.equals(DRIVER_PHANTOMJS)) { + // "phantomjs_exec_path" + if (sConfig.getProperty("phantomjs_exec_path") != null) { + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, + sConfig.getProperty("phantomjs_exec_path")); + } else { + throw new IOException( + String.format( + "Property '%s' not set!", + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); + } + // "phantomjs_driver_path" + if (sConfig.getProperty("phantomjs_driver_path") != null) { + System.out.println("Test will use an external GhostDriver"); + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, + sConfig.getProperty("phantomjs_driver_path")); + } else { + System.out + .println("Test will use PhantomJS internal GhostDriver"); + } } + }catch(Exception e){ + throw new IOException("Can not load config file properly"); + + }finally{ + configFileReader.close(); } + + // Disable "web-security", enable all possible "ssl-protocols" and // "ignore-ssl-errors" for PhantomJSDriver // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new From 6b6a419ef956761c8d04ed077fdfdd92b87ac0d9 Mon Sep 17 00:00:00 2001 From: Vivian Delannoy Date: Mon, 12 Apr 2021 00:58:52 +0200 Subject: [PATCH 3/3] code smells fixes --- .../main/java/us/codecraft/webmagic/Page.java | 3 +- .../java/us/codecraft/webmagic/Request.java | 6 +- .../us/codecraft/webmagic/ResultItems.java | 2 +- .../main/java/us/codecraft/webmagic/Site.java | 10 +-- .../java/us/codecraft/webmagic/Spider.java | 17 ++-- .../downloader/AbstractDownloader.java | 2 +- .../downloader/HttpClientDownloader.java | 2 +- .../webmagic/model/HttpRequestBody.java | 8 +- .../webmagic/utils/CharsetUtils.java | 7 +- .../scheduler/ZipCodePageProcessor.java | 1 + .../downloader/selenium/WebDriverPool.java | 81 +++++++++---------- 11 files changed, 68 insertions(+), 71 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..15206b920 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,7 +49,7 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; @@ -108,6 +108,7 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ + @Deprecated public void setHtml(Html html) { this.html = html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9fc286192..b73665ab2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -40,9 +40,9 @@ public class Request implements Serializable { /** * cookies for current url, if not set use Site's cookies */ - private Map cookies = new HashMap(); + private Map cookies = new HashMap<>(); - private Map headers = new HashMap(); + private Map headers = new HashMap<>(); /** * Priority of the request.
@@ -94,7 +94,7 @@ public T getExtra(String key) { public Request putExtra(String key, T value) { if (extras == null) { - extras = new HashMap(); + extras = new HashMap<>(); } extras.put(key, value); return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 488c81e77..273b0a30e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -14,7 +14,7 @@ */ public class ResultItems { - private Map fields = new LinkedHashMap(); + private Map fields = new LinkedHashMap<>(); private Request request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b2825..9cbda0222 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -22,9 +22,9 @@ public class Site { private String userAgent; - private Map defaultCookies = new LinkedHashMap(); + private Map defaultCookies = new LinkedHashMap<>(); - private Map> cookies = new HashMap>(); + private Map> cookies = new HashMap<>(); private String charset; @@ -38,11 +38,11 @@ public class Site { private int timeOut = 5000; - private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet<>(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; - private Map headers = new HashMap(); + private Map headers = new HashMap<>(); private boolean useGzip = true; @@ -83,7 +83,7 @@ public Site addCookie(String name, String value) { */ public Site addCookie(String domain, String name, String value) { if (!cookies.containsKey(domain)){ - cookies.put(domain,new HashMap()); + cookies.put(domain,new HashMap<>()); } cookies.get(domain).put(name, value); return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index ce6383751..925548147 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -62,7 +62,7 @@ public class Spider implements Runnable, Task { protected Downloader downloader; - protected List pipelines = new ArrayList(); + protected List pipelines = new ArrayList<>(); protected PageProcessor pageProcessor; @@ -86,11 +86,11 @@ public class Spider implements Runnable, Task { protected boolean exitWhenComplete = true; - protected final static int STAT_INIT = 0; + protected static final int STAT_INIT = 0; - protected final static int STAT_RUNNING = 1; + protected static final int STAT_RUNNING = 1; - protected final static int STAT_STOPPED = 2; + protected static final int STAT_STOPPED = 2; protected boolean spawnUrl = true; @@ -248,7 +248,7 @@ public Spider setPipelines(List pipelines) { * @return this */ public Spider clearPipeline() { - pipelines = new ArrayList(); + pipelines = new ArrayList<>(); return this; } @@ -439,7 +439,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { @@ -545,7 +544,7 @@ protected CollectorPipeline getCollectorPipeline() { public T get(String url) { List urls = WMCollections.newArrayList(url); List resultItemses = getAll(urls); - if (resultItemses != null && resultItemses.size() > 0) { + if (resultItemses != null && !(resultItemses.isEmpty())) { return resultItemses.get(0); } else { return null; @@ -678,7 +677,7 @@ public Status getStatus() { public enum Status { - Init(0), Running(1), Stopped(2); + INIT(0), RUNNING(1), STOPPED(2); private Status(int value) { this.value = value; @@ -697,7 +696,7 @@ public static Status fromValue(int value) { } } //default value - return Init; + return INIT; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d09..a71a7d876 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -32,7 +32,7 @@ public Html download(String url) { */ public Html download(String url, String charset) { Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); - return (Html) page.getHtml(); + return page.getHtml(); } protected void onSuccess(Request request) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e111..b2a39e910 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -33,7 +33,7 @@ public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); - private final Map httpClients = new HashMap(); + private final Map httpClients = new HashMap<>(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 7d3b30785..afd6f88b0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -16,6 +16,8 @@ */ public class HttpRequestBody implements Serializable { + private static final String ILL_ENC = "illegal encoding "; + private static final long serialVersionUID = 5659170945717023595L; public static abstract class ContentType { @@ -68,7 +70,7 @@ public static HttpRequestBody json(String json, String encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } @@ -76,7 +78,7 @@ public static HttpRequestBody xml(String xml, String encoding) { try { return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } @@ -92,7 +94,7 @@ public static HttpRequestBody form(Map params, String encoding){ try { return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding); } catch (UnsupportedEncodingException e) { - throw new IllegalArgumentException("illegal encoding " + encoding, e); + throw new IllegalArgumentException(ILL_ENC + encoding, e); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index ccf00a466..85852bcae 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -19,6 +19,7 @@ */ public abstract class CharsetUtils { + private static final String CHR = "charset"; private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { @@ -40,9 +41,9 @@ public static String detectCharset(String contentType, byte[] contentBytes) thro for (Element link : links) { // 2.1、html4.01 String metaContent = link.attr("content"); - String metaCharset = link.attr("charset"); - if (metaContent.indexOf("charset") != -1) { - metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); + String metaCharset = link.attr(CHR); + if (metaContent.indexOf(CHR) != -1) { + metaContent = metaContent.substring(metaContent.indexOf(CHR), metaContent.length()); charset = metaContent.split("=")[1]; break; } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java index 3f2de70c5..e2759e563 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -19,6 +19,7 @@ */ public class ZipCodePageProcessor implements PageProcessor { + private Site site = Site.me().setCharset("gb2312") .setSleepTime(100); diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index d53630fe7..4de3e01ec 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -63,54 +63,47 @@ class WebDriverPool { * @throws IOException */ public void configure() throws IOException { - try{ - // Read config file - sConfig = new Properties(); - String configFile = DEFAULT_CONFIG_FILE; - if (System.getProperty("selenuim_config")!=null){ - configFile = System.getProperty("selenuim_config"); + + // Read config file + sConfig = new Properties(); + String configFile = DEFAULT_CONFIG_FILE; + if (System.getProperty("selenuim_config")!=null){ + configFile = System.getProperty("selenuim_config"); + } + sConfig.load(new FileReader(configFile)); + + // Prepare capabilities + sCaps = new DesiredCapabilities(); + sCaps.setJavascriptEnabled(true); + sCaps.setCapability("takesScreenshot", false); + + String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); + + // Fetch PhantomJS-specific configuration parameters + if (driver.equals(DRIVER_PHANTOMJS)) { + // "phantomjs_exec_path" + if (sConfig.getProperty("phantomjs_exec_path") != null) { + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, + sConfig.getProperty("phantomjs_exec_path")); + } else { + throw new IOException( + String.format( + "Property '%s' not set!", + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); } - FileReader configFileReader = new FileReader(configFile) - sConfig.load(configFileReader); - - // Prepare capabilities - sCaps = new DesiredCapabilities(); - sCaps.setJavascriptEnabled(true); - sCaps.setCapability("takesScreenshot", false); - String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); - // Fetch PhantomJS-specific configuration parameters - if (driver.equals(DRIVER_PHANTOMJS)) { - // "phantomjs_exec_path" - if (sConfig.getProperty("phantomjs_exec_path") != null) { - sCaps.setCapability( - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, - sConfig.getProperty("phantomjs_exec_path")); - } else { - throw new IOException( - String.format( - "Property '%s' not set!", - PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); - } - // "phantomjs_driver_path" - if (sConfig.getProperty("phantomjs_driver_path") != null) { - System.out.println("Test will use an external GhostDriver"); - sCaps.setCapability( - PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, - sConfig.getProperty("phantomjs_driver_path")); - } else { - System.out - .println("Test will use PhantomJS internal GhostDriver"); - } + // "phantomjs_driver_path" + if (sConfig.getProperty("phantomjs_driver_path") != null) { + System.out.println("Test will use an external GhostDriver"); + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, + sConfig.getProperty("phantomjs_driver_path")); + } else { + System.out + .println("Test will use PhantomJS internal GhostDriver"); } - }catch(Exception e){ - throw new IOException("Can not load config file properly"); - - }finally{ - configFileReader.close(); } - - // Disable "web-security", enable all possible "ssl-protocols" and // "ignore-ssl-errors" for PhantomJSDriver // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new