samedi 25 janvier 2020

Extract email address from a website from each link inside DOM of page

I Want to develope an app I give Url of a specific website to it,and it extract all links from that Web page. For each extracted link I want to get the HTML content. I am based in the concept of deep crawling. My purpose is to get all email addresses of website. Below is my source code:

 static string ExtractEmails(string data)
 {

            //instantiate with this pattern 
            Regex emailRegex = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
            //find items that matches with our pattern
            MatchCollection emailMatches = emailRegex.Matches(data);

            //StringBuilder sb = new StringBuilder();
            string s = "";
            foreach (Match emailMatch in emailMatches)
            {
                //sb.AppendLine(emailMatch.Value);
                s += emailMatch.Value + ",";
            }
            return s;
 }

     static readonly List<ParsResult> _results = new List<ParsResult>();
        static Int32 _maxDepth = 4;
        static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
        {
            string email = "";
            if (depth >= _maxDepth) return email;
            String html;
            using (var wc = new WebClient())
                html = wc.DownloadString(urlToCheck ?? parent.Url);

            var doc = new HtmlDocument();
            doc.LoadHtml(html);
            var aNods = doc.DocumentNode.SelectNodes("//a");
            if (aNods == null || !aNods.Any()) return email;
            foreach (var aNode in aNods)
            {
                var url = aNode.Attributes["href"];
                if (url == null)
                    continue;

                var wc2 = new WebClient();
                String html2 = wc2.DownloadString(url.Value);
                email = ExtractEmails(html2);
                Console.WriteLine(email);
                var result = new ParsResult
                {
                    Depth = depth,
                    Parent = parent,
                    Url = url.Value
                };
                _results.Add(result);
                Console.WriteLine("{0} - {1}", depth, result.Url);
                Foo(depth: depth + 1, parent: result);
                return email;
            }
            return email;
        }

static void Main(string[] args)
{
    String res = Foo("http://www.mobileridoda.com", 0);
    Console.WriteLine("emails " + res);
}

I want to dispaly in console all emails extracted by all pages of all links that are inside DOM of Main page, But it dispalys no emails in console




Aucun commentaire:

Enregistrer un commentaire