diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..f822b5602 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -49,12 +49,10 @@ public class Page { private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; - public Page() { - } public static Page fail(){ Page page = new Page(); @@ -105,9 +103,9 @@ public Json getJson() { /** * @param html html - * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ + public void setHtml(Html html) { this.html = html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 886e74a92..d9117ce2f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -62,7 +62,7 @@ public class Spider implements Runnable, Task { protected Downloader downloader; - protected List pipelines = new ArrayList(); + protected List pipelines = new ArrayList<>(); protected PageProcessor pageProcessor; @@ -86,11 +86,11 @@ public class Spider implements Runnable, Task { protected boolean exitWhenComplete = true; - protected final static int STAT_INIT = 0; + protected static final int STAT_INIT = 0; - protected final static int STAT_RUNNING = 1; + protected static final int STAT_RUNNING = 1; - protected final static int STAT_STOPPED = 2; + protected static final int STAT_STOPPED = 2; protected boolean spawnUrl = true; @@ -246,7 +246,7 @@ public Spider setPipelines(List pipelines) { * @return this */ public Spider clearPipeline() { - pipelines = new ArrayList(); + pipelines = new ArrayList<>(); return this; } @@ -313,7 +313,8 @@ public void run() { // wait until new url added waitNewUrl(); } else { - threadPool.execute(new Runnable() { + threadPool.execute( + new Runnable() { @Override public void run() { try { @@ -427,7 +428,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { @@ -458,6 +458,8 @@ protected void sleep(int time) { Thread.sleep(time); } catch (InterruptedException e) { logger.error("Thread interrupted when sleep",e); + //restore interrupted thread + Thread.currentThread().interrupt(); } } @@ -564,6 +566,7 @@ private void waitNewUrl() { newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { logger.warn("waitNewUrl - interrupted, error {}", e); + Thread.currentThread().interrupt(); } finally { newUrlLock.unlock(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e26998..e4ffd38d1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -15,7 +15,9 @@ */ @Experimental public class SmartContentSelector implements Selector { - + /*** + * Empty/ default constructor for SmartContentSelector + */ public SmartContentSelector() { } @@ -33,7 +35,7 @@ public String select(String html) { int start; int end; StringBuilder text = new StringBuilder(); - ArrayList indexDistribution = new ArrayList(); + ArrayList indexDistribution = new ArrayList<>(); lines = Arrays.asList(html.split("\n")); @@ -47,39 +49,42 @@ public String select(String html) { } start = -1; end = -1; - boolean boolstart = false, boolend = false; + boolean boolstart = false; + boolean boolend = false; text.setLength(0); - - for (int i = 0; i < indexDistribution.size() - 1; i++) { - if (indexDistribution.get(i) > threshold && ! boolstart) { - if (indexDistribution.get(i+1).intValue() != 0 + + int i=0; + while (i < indexDistribution.size() - 1) { + + if ((indexDistribution.get(i) > threshold && ! boolstart) + && (indexDistribution.get(i+1).intValue() != 0 || indexDistribution.get(i+2).intValue() != 0 - || indexDistribution.get(i+3).intValue() != 0) { + || indexDistribution.get(i+3).intValue() != 0) ){ boolstart = true; start = i; - continue; + i++; } - } - if (boolstart) { - if (indexDistribution.get(i).intValue() == 0 - || indexDistribution.get(i+1).intValue() == 0) { + + if ((boolstart) && (indexDistribution.get(i).intValue() == 0 + || indexDistribution.get(i+1).intValue() == 0) ){ end = i; boolend = true; } - } + + StringBuilder tmp = new StringBuilder(); if (boolend) { - //System.out.println(start+1 + "\t\t" + end+1); for (int ii = start; ii <= end; ii++) { - if (lines.get(ii).length() < 5) continue; + if (lines.get(ii).length() < 5) i++; tmp.append(lines.get(ii) + "\n"); } String str = tmp.toString(); - //System.out.println(str); - if (str.contains("Copyright") ) continue; + + if (str.contains("Copyright")) i++; text.append(str); boolstart = boolend = false; } + i++; } return text.toString(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java index 79b9efece..721401f8b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java @@ -11,6 +11,14 @@ public class FilePersistentBase { protected String path; + + public FilePersistentBase() { + setPath("/data/webmagic/"); + } + + public FilePersistentBase(String path) { + setPath(path); + } public static String PATH_SEPERATOR = "/"; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java index 0db9b819d..22d818217 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -28,11 +28,11 @@ public class FilePageModelPipeline extends FilePersistentBase implements PageMod * new JsonFilePageModelPipeline with default path "/data/webmagic/" */ public FilePageModelPipeline() { - setPath("/data/webmagic/"); + super(); } public FilePageModelPipeline(String path) { - setPath(path); + super(path); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index 7a7f80a25..7b3b3035c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -29,11 +29,11 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag * new JsonFilePageModelPipeline with default path "/data/webmagic/" */ public JsonFilePageModelPipeline() { - setPath("/data/webmagic/"); + super(); } public JsonFilePageModelPipeline(String path) { - setPath(path); + super(path); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java index 3d416964b..dafb7ce70 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java @@ -12,7 +12,7 @@ */ public abstract class IPUtils { - public static String getFirstNoLoopbackIPAddresses() throws SocketException { + public static String getFirstNoLoopbackIPAddresses() throws SocketException, NullPointerException{ Enumeration networkInterfaces = NetworkInterface.getNetworkInterfaces();