lundi 3 juillet 2017

why my crawledURL is null?

I am following a tutorial to create a web crawler in java. When I run the code my crawledURL is null. *** Malformed URL :null in a infinite loop.

Can anyone explain to me why is this happening?

Here is the whole code:

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
import java.net.*;

public class WebCrawler {

public static Queue<String> Queue = new LinkedList<>();
public static Set<String> marked = new HashSet<>();
public static String regex = "http[s]://(\\w+\\.)*(\\w+)";

public static void bfsAlgorithm(String root) throws IOException {

    Queue.add(root);

    while (!Queue.isEmpty()) {

        String crawledURL = Queue.poll();
        System.out.println("\n=== Site crawled : " + crawledURL + "===");

        //Limiting to a 100 websites here 

        if(marked.size() > 100)
            return;

        boolean ok = false;
        URL url = null;
        BufferedReader br = null;

        while (!ok) {
            try {
                url = new URL(crawledURL);
                br = new BufferedReader(new InputStreamReader(url.openStream()));
                ok = true;

            } catch (MalformedURLException e) {
                System.out.println("*** Malformed URL :" + crawledURL);
                crawledURL = Queue.poll();
                ok = false;

            } catch (IOException ioe) { 
                System.out.println("*** IOException for URL :" + crawledURL);
                crawledURL = Queue.poll();
                ok = false;


        }

    }

        StringBuilder sb = new StringBuilder();

        while((crawledURL = br.readLine()) != null) {
            sb.append(crawledURL);
        }

        crawledURL = sb.toString();
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(crawledURL);


        while (matcher.find()){

            String w = matcher.group();

            if (!marked.contains(w)) {
                marked.add(w);
                System.out.println("Site added for crawling : " + w);
                Queue.add(w);
            }
        }

    }

}


public static void showResults() {
    System.out.println("\n\nResults :");
    System.out.print("Web sites craweled: " + marked.size() + "\n");

    for (String s : marked) {
        System.out.println("* " + s);
    }

}

public static void main(String[] args) {

    try {

        bfsAlgorithm("http://ift.tt/2sBdMQK");
        showResults();

    } catch (IOException e) {

        //TODO Auto-generated catch block
        e.printStackTrace();
    }
}

}




Aucun commentaire:

Enregistrer un commentaire