Click here to Skip to main content
15,891,033 members
Please Sign up or sign in to vote.
1.00/5 (2 votes)
See more:
I'm designing this app:

/// <summary>
/// Downloads the page at <paramref name="add"/> and returns its same-site links as a single
/// string in which every link is followed by a comma (for example "http://site/a,http://site/b,").
/// </summary>
/// <param name="add">Address of the page to download. Null or empty yields an empty result.</param>
/// <param name="url">Site root entered by the user. Links that start with it are kept as-is;
/// root-relative links ("/...") are prefixed with it.</param>
/// <returns>The kept links, each terminated by a comma, or an empty string when none were kept.</returns>
private string fetch(string add, string url)
        {
            // Nothing to download. IsNullOrEmpty also covers a null address, which would
            // otherwise throw on add.Length.
            if (string.IsNullOrEmpty(add))
            {
                return string.Empty;
            }

            HtmlWeb htmlweb = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument doc = htmlweb.Load(add);
            if (doc == null || doc.DocumentNode == null)
            {
                return string.Empty;
            }

            // SelectNodes yields null rather than an empty collection when no anchor matches.
            HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                return string.Empty;
            }

            // Accumulate in a builder; repeated string concatenation copies the whole
            // result on every link.
            System.Text.StringBuilder fet = new System.Text.StringBuilder();
            foreach (HtmlNode li in anchors)
            {
                string href = li.Attributes["href"].Value;

                // Skip comment/reply anchors and referral links. This check has to run before
                // the prefix checks: as a trailing else-branch it could never exclude a link
                // that one of the earlier branches had already accepted.
                if (href.Contains("#comment") || href.Contains("#respond") || href.Contains("/refer/"))
                {
                    continue;
                }

                // Ordinal comparison: URLs are identifiers, not culture-sensitive text.
                if (href.StartsWith(url, StringComparison.Ordinal))
                {
                    fet.Append(href).Append(',');
                }
                else if (href.StartsWith("/", StringComparison.Ordinal))
                {
                    // Trim a trailing slash from the root so "http://site/" + "/page"
                    // does not become "http://site//page".
                    fet.Append(url.TrimEnd('/')).Append(href).Append(',');
                }
            }

            return fet.ToString();
        }


C#
/// <summary>
/// Loads the page at the URL typed into textBox1, collects its same-site links, calls
/// <c>fetch</c> on each of those links, and writes the distinct links that <c>fetch</c>
/// returned to a text file, one per line.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
        {
            // Read the text box once so every step works from the same value.
            string baseUrl = textBox1.Text;
            if (string.IsNullOrWhiteSpace(baseUrl))
            {
                // Nothing to crawl; avoid handing an empty address to HtmlWeb.Load.
                return;
            }

            HtmlWeb hw = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument doc = hw.Load(baseUrl);

            // First level: same-site links found on the start page.
            List<string> link2 = new List<string>();

            // SelectNodes yields null rather than an empty collection when no anchor matches,
            // so it must be checked before being enumerated.
            var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
            if (anchors != null)
            {
                foreach (HtmlNode link in anchors)
                {
                    string s = link.Attributes["href"].Value;
                    if (string.IsNullOrWhiteSpace(s))
                    {
                        continue;
                    }

                    // Ordinal comparison: URLs are identifiers, not culture-sensitive text.
                    if (s.StartsWith(baseUrl, StringComparison.Ordinal))
                    {
                        link2.Add(s);
                    }
                    else if (s.StartsWith("/", StringComparison.Ordinal))
                    {
                        // Trim a trailing slash from the root so the join has exactly one "/".
                        link2.Add(baseUrl.TrimEnd('/') + s);
                    }
                }
            }

            // Second level: gather the links found on each first-level page.
            System.Text.StringBuilder ret = new System.Text.StringBuilder();
            foreach (string s in link2)
            {
                ret.Append(fetch(s, baseUrl));
            }

            // fetch terminates every link with a comma, so a plain Split would produce an
            // empty entry (and a blank line in the output file); drop empty entries.
            string[] q1 = ret.ToString()
                .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
                .Distinct()
                .ToArray();

            using (StreamWriter writer = new StreamWriter(@"C:\Users\M Adeel\Documents\Visual Studio 2010\Projects\CrawlingLinks\Projects\test.txt"))
            {
                foreach (string s in q1)
                {
                    writer.Write(s + "\r\n");
                }
            }
        }

Let me explain this code first. I'm using HtmlAgilityPack to extract URLs from a website. It almost works, but not fully. The button handler first loads the page at the URL entered in a text box using HtmlAgilityPack, saves its links in a list, and then further manipulates those links. In a foreach loop I call the function fetch with two parameters, add and url: add is an inner link of the site, such as http://www.example.com/page1/, and url is the site root, http://www.example.com, as entered by the user. The problem is that it fetches the links, but I need the code modified so that it recursively visits the inner pages of the website and keeps crawling until it reaches the last inner URL of the site. How can I modify it? My logic is poor for this situation. For example, I gave it a domain name like http://www.example.com; it goes to pages like http://www.example.com/blog/page1 and /blog/page2 and then jumps directly to the last page, /blog/page144, without fetching the pages in between one by one. Only three page links are actually listed on the blog page of that website, so I think a recursive call would be better than writing inefficient, time-consuming code. I hope you'll understand my query.
It doesn't crawl the website as a whole. I've been looking around for a solution but have been unable to find one.
Regards!
Posted

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900