I'm trying to scrape some Web pages after the Javascript code has been executed. I need to do that in parallel because I have to scrape over 500 pages.
So far I tried on this way:
This is the class used to Scrape
public class Class1
{
private HtmlAgilityPack.HtmlDocument Doc;
private string page1;
private string page2;
private string page3;
public async Task<string> LoadDynamicPage(string url, CancellationToken token, WebBrowser webBrowser)
{
// navigate and await DocumentCompleted
var tcs = new TaskCompletionSource<bool>();
WebBrowserDocumentCompletedEventHandler handler = (s, arg) =>
tcs.TrySetResult(true);
using (token.Register(() => tcs.TrySetCanceled(), useSynchronizationContext: true))
{
webBrowser.DocumentCompleted += handler;
try
{
webBrowser.Navigate(url);
await tcs.Task; // wait for DocumentCompleted
}
finally
{
webBrowser.DocumentCompleted -= handler;
}
}
// get the root element
var documentElement = webBrowser.Document.GetElementsByTagName("html")[0];
// poll the current HTML for changes asynchronosly
var html = documentElement.OuterHtml;
while (true)
{
// wait asynchronously, this will throw if cancellation requested
await Task.Delay(500, token);
// continue polling if the WebBrowser is still busy
if (webBrowser.IsBusy)
continue;
var htmlNow = documentElement.OuterHtml;
if (html == htmlNow)
break; // no changes detected, end the poll loop
html = htmlNow;
}
// consider the page fully rendered
token.ThrowIfCancellationRequested();
return html;
}
private async Task InitializeHTML(string html, WebBrowser webBrowser)
{
var cts = new CancellationTokenSource(TimeSpan.FromSeconds(3600));
string code = await LoadDynamicPage(html, cts.Token, webBrowser);
Doc = new HtmlAgilityPack.HtmlDocument();
Doc.LoadHtml(code);
}
public async Task<Class1> GetData(string standardPage, WebBrowser webBrowser)
{
Class1 obj = new Class1();
try
{
// Navigate to standardPage
await InitializeHTML(standardPage, webBrowser);
// ---
// Do some operations
// ---
// Navigate to page1
await InitializeHTML(page1, webBrowser);
// ---
// Do some operations
// ---
// Navigate to page2
await InitializeHTML(page2, webBrowser);
// ---
// Do some operations
// ---
// Navigate to page3
await InitializeHTML(page3, webBrowser);
// ---
// Do some operations
// ---
}
catch (Exception exc)
{
Debug.WriteLine(exc.Message);
}
return obj;
}
}
So on LoadDynamicPage function, I use a Web Browser to get the HTML code after the Javascript has been executed.
Then InitializeHTML is used to create a HTML Document with HtmlAgilityPack.
Finally I have the Scrape Method (GetData) where I scrape several subpages of the website passed as parameter and it returns a Class1 object asynchronously.
Here I call the GetData function:
public class Class2
{
public async void DoWork()
{
// Web Pages List
List<string> pages = new List<string> {"www.example.com", "www.example2.com" };
WebBrowser webBrowser = new WebBrowser();
List<Task<Class1>> tasks = new List<Task<Class1>>();
// Class1 Objects List to be filled with Task results
List<Class1> objects = new List<Class1>();
// Temp object to access Class1.GetData Method
Class1 tmpObj = new Class1();
for (int i = 0; i < pages.Count; i++)
{
tasks.Add(tmpObj.GetData(pages[i], webBrowser));
}
var results = await Task.WhenAll(tasks.ToArray());
}
}
There are no errors but I get always the same HTML code (same web page) on the Task results at the end.
Where did I go wrong?
Thanks for the help.
Aucun commentaire:
Enregistrer un commentaire