Hello everyone, I'm implementing a multi-threaded web crawler and I'm stuck at a point where my application hangs. I'm sharing the code that causes the problem: multiple threads enter the CrawlLink method, perform the header check, and then reach the HttpWebRequest section of the code — that is where the application hangs.
// Crawls one URL on a worker thread: HEAD-checks it via GetServer(), and for
// in-site text/html pages downloads the body asynchronously, parses it with
// HtmlAgilityPack, and harvests <a>/<link>/<img>/<script> references plus the
// page title and meta description. Timing and results are written into the
// shared `obj` record and reported through the ListView update helpers.
//
// NOTE(review): `obj`, `stopwatch`, `hours`/`minutes`/`second`, `CurrentUrl`
// and `URL` appear to be instance fields shared by every crawler thread, so
// concurrent calls to this method interleave their writes to them — confirm
// whether each worker thread gets its own instance.
private void CrawlLink(String currentUrl)
{
// HEAD request: fills obj.StatusCode (and the CurrentUrl field) and returns
// "OK" on success. NOTE(review): the shared field `obj` is passed by ref here
// while the `currentUrl` parameter is never used anywhere in this method —
// verify the intent.
string response = GetServer(ref obj);
obj.Address = URL;
if (response.Equals("OK"))
{
// Only follow links that stay inside the start site.
if (CurrentUrl.Contains(URL))
{
if (!obj.Type.Equals("text/html"))
{
// Non-HTML resource: record the elapsed time and report it as done.
stopwatch.Stop();
hours = stopwatch.Elapsed.Hours;
minutes = stopwatch.Elapsed.Minutes;
second = stopwatch.Elapsed.Seconds;
int mSecond = stopwatch.Elapsed.Milliseconds;
obj.Duration = hours + ":" + minutes + ":" + second + ":" + mSecond;
minutes = 0; second = 0; hours = 0;
UpdatePendingLinks();
UpdateListViewItems3(obj);
RefreshList();
}
else
{
// HTML page: fetch the full body asynchronously.
// NOTE(review): despite the "Begin" name, BeginGetResponse can still block
// the calling thread (e.g. for proxy detection / DNS resolution) before it
// returns — presumably the freeze being reported; confirm with a profiler.
var request = (HttpWebRequest)WebRequest.Create( new Uri(CurrentUrl));
request.UserAgent = "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))";
request.BeginGetResponse(r =>
{
// Completion callback: runs on a thread-pool thread once the response
// headers arrive.
var httpRequest = (HttpWebRequest)r.AsyncState;
using (var httpResponse = (HttpWebResponse)httpRequest.EndGetResponse(r))
{
using (var reader = new StreamReader(httpResponse.GetResponseStream()))
{
HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
// NOTE(review): `web` is created but never used in this callback.
HtmlWeb web = new HtmlWeb();
try
{
htmlDocument.Load(reader);
// SelectNodes returns null (not an empty list) when nothing matches,
// hence the null checks below before filtering.
IEnumerable<HtmlNode> ImageSource = htmlDocument.DocumentNode.SelectNodes("//img[@src]");
IEnumerable<HtmlNode> ScriptSource = htmlDocument.DocumentNode.SelectNodes("//script[@src]");
IEnumerable<HtmlNode> LinksHref = htmlDocument.DocumentNode.SelectNodes("//link[@href]");
IEnumerable<HtmlNode> AnchorHref = htmlDocument.DocumentNode.SelectNodes("//a[@href]");
if (LinksHref != null)
HreFilter(LinksHref);
if (AnchorHref != null)
HreFilter(AnchorHref);
if (ImageSource != null)
SrcFilter(ImageSource);
if (ScriptSource != null)
SrcFilter(ScriptSource);
if (htmlDocument.DocumentNode.SelectSingleNode("//title") != null)
obj.Title = htmlDocument.DocumentNode.SelectSingleNode("//title").InnerText;
if (htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='description']") != null)
obj.Desc = htmlDocument.DocumentNode.SelectSingleNode("//meta[@name='description']").Attributes["content"].Value;
}
catch (Exception e)
{
// A parse/read failure is recorded on the shared record instead of
// crashing the thread-pool thread.
obj.Error = e.Message;
}
}
}
}, request);
// NOTE(review): everything from here down runs immediately after
// BeginGetResponse returns — i.e. BEFORE the callback above has necessarily
// completed. CrawlFurtherLinks and the obj.Error check below therefore race
// against the download: obj.Title / obj.Desc / obj.Error may not yet be
// populated when the list view row is written.
CrawlFurtherLinks(ref obj);
stopwatch.Stop();
hours = stopwatch.Elapsed.Hours;
minutes = stopwatch.Elapsed.Minutes;
second = stopwatch.Elapsed.Seconds;
int mSecond = stopwatch.Elapsed.Milliseconds;
obj.Duration = hours + ":" + minutes + ":" + second + ":" + mSecond;
obj.Level = "1";
UpdatePendingLinks();
if (!string.IsNullOrWhiteSpace(obj.Error))
UpdateListViewItems2x(obj);
else
UpdateListViewItems3(obj);
RefreshList();
minutes = 0; second = 0; hours = 0;
}
}
else
{
// External link: record the elapsed time and report it without downloading
// the body.
stopwatch.Stop();
hours = stopwatch.Elapsed.Hours;
minutes = stopwatch.Elapsed.Minutes;
second = stopwatch.Elapsed.Seconds;
int mSecond = stopwatch.Elapsed.Milliseconds;
obj.Duration = hours + ":" + minutes + ":" + second + ":" + mSecond;
minutes = 0; second = 0; hours = 0;
UpdatePendingLinks();
UpdateListViewItems3(obj);
RefreshList();
}
}
else
{
// The HEAD check failed: report the error row only.
UpdateListViewItems2x(obj);
RefreshList();
}
}
}
Basically this function takes a URL and passes it to the GetServer() method, which reports whether the server responds. Next it checks whether the URL is an internal link and whether the page type is text/html; if so, it proceeds to the section where HttpWebRequest is used.
The problem is that when more than one thread enters this method and reaches the HttpWebRequest/HttpWebResponse code, the application hangs until the URL has been downloaded. I'm sharing the GetServer method too:
// Sends a HEAD request to `currentUrl` and returns "OK" if the server
// answered, "Null" otherwise.
//
// Side effects: records the (possibly redirected) response URI in the
// CurrentUrl field, the HTTP status code in obj.StatusCode, and any
// failure message in obj.Error.
//
// Fixes applied in this revision:
//  * the HttpWebRequest is now a LOCAL variable instead of the shared
//    `request` field, so concurrent crawler threads no longer overwrite
//    each other's in-flight request;
//  * the response is wrapped in `using`, guaranteeing the connection is
//    released even if reading the response throws. A leaked response keeps
//    its connection counted against the per-host ServicePoint connection
//    limit (default 2), which makes every subsequent request to that host
//    block — the "hang" described in the question;
//  * malformed URLs (UriFormatException from UriBuilder) are reported via
//    obj.Error instead of escaping and killing the worker thread;
//  * the unused local `scode` was removed.
public string GetServer(string currentUrl)
{
    string response = "Null";
    try
    {
        // Local request: each thread owns its own request object.
        var headRequest = (HttpWebRequest)WebRequest.Create(new UriBuilder(currentUrl).Uri);
        headRequest.AllowAutoRedirect = false;
        headRequest.Timeout = 1500;
        headRequest.Method = "HEAD";
        using (var resp = (HttpWebResponse)headRequest.GetResponse())
        {
            CurrentUrl = resp.ResponseUri.AbsoluteUri;
            obj.StatusCode = (int)resp.StatusCode;
            response = "OK";
        } // connection released here on every path, including exceptions
    }
    catch (WebException e)
    {
        // Timeout, DNS failure, or non-success status surfaced by the stack.
        obj.Error = e.Message;
        response = "Null";
    }
    catch (UriFormatException e)
    {
        // Malformed URL — previously unhandled and fatal to the thread.
        obj.Error = e.Message;
    }
    return response;
}
Kindly suggest where I should modify the code so my application works without hanging. One more noticeable thing: since I'm using threads, the UI should stay responsive — and my threads generally behave well — but at the HttpWebRequest stage the application freezes for a while and then responds again.