I am following a tutorial to create a web crawler in java. When I run the code my crawledURL is null. *** Malformed URL :null in a infinite loop.
Can anyone explain to me why is this happening?
Here is the whole code:
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
import java.net.*;
public class WebCrawler {
public static Queue<String> Queue = new LinkedList<>();
public static Set<String> marked = new HashSet<>();
public static String regex = "http[s]://(\\w+\\.)*(\\w+)";
public static void bfsAlgorithm(String root) throws IOException {
Queue.add(root);
while (!Queue.isEmpty()) {
String crawledURL = Queue.poll();
System.out.println("\n=== Site crawled : " + crawledURL + "===");
//Limiting to a 100 websites here
if(marked.size() > 100)
return;
boolean ok = false;
URL url = null;
BufferedReader br = null;
while (!ok) {
try {
url = new URL(crawledURL);
br = new BufferedReader(new InputStreamReader(url.openStream()));
ok = true;
} catch (MalformedURLException e) {
System.out.println("*** Malformed URL :" + crawledURL);
crawledURL = Queue.poll();
ok = false;
} catch (IOException ioe) {
System.out.println("*** IOException for URL :" + crawledURL);
crawledURL = Queue.poll();
ok = false;
}
}
StringBuilder sb = new StringBuilder();
while((crawledURL = br.readLine()) != null) {
sb.append(crawledURL);
}
crawledURL = sb.toString();
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(crawledURL);
while (matcher.find()){
String w = matcher.group();
if (!marked.contains(w)) {
marked.add(w);
System.out.println("Site added for crawling : " + w);
Queue.add(w);
}
}
}
}
public static void showResults() {
System.out.println("\n\nResults :");
System.out.print("Web sites craweled: " + marked.size() + "\n");
for (String s : marked) {
System.out.println("* " + s);
}
}
public static void main(String[] args) {
try {
bfsAlgorithm("http://ift.tt/2sBdMQK");
showResults();
} catch (IOException e) {
//TODO Auto-generated catch block
e.printStackTrace();
}
}
}
Aucun commentaire:
Enregistrer un commentaire