Click here to Skip to main content
15,891,976 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
See more:
Hi , im trying to extract data from that website , first im trying to get categories and from each categories i am getting subcategories and then from each subcategory im trying to read and extract some text.

Im using webclient that im facing some strange problem that sometimes i doesnt read data sometimes it reads data.

Sometimes i get 30 categories but get subcategories from only 10 sometimes i get 10 categories and read all subcategories using web client.

how to solve this problem ?
followin is he code :
C#
public Extract(string url)
       {

           client = new WebClient();
           strm = client.OpenRead(url);
           strrdr = new StreamReader(strm, Encoding.ASCII);
           categorylines = new List<string>();
           subcategorylines = new List<string>[30];

           while (strrdr.Peek() > 0)
           {

               string line = strrdr.ReadLine();
               line = line.Replace("\n", String.Empty);
               line = line.Replace("\t", string.Empty);
               line = line.Replace("\r", string.Empty);
               line = line.Replace("\\", "");
               //System.Threading.Thread.Sleep(100);
               ExtractLines(line);

           }
           strrdr.Close();
       }

       public void ExtractSubcategories() {
           string url = null;
           string name = null;

           for (int i = 0; i < Categories.Category.Count; i++)
           {
               foreach (var item in subcategorylines[i])
               {
                   find1 = new Regex(@"href="".+"">", RegexOptions.IgnoreCase);
                   find2 = new Regex(@">.+<\/a>", RegexOptions.IgnoreCase);
                   m1 = find1.Match(item);
                   m2 = find2.Match(item);
                   if (m1.Success)
                   {
                       url = item.Substring(m1.Index+6, m1.Length - 8);
                       url = "www.codeproject.com" + url;
                   }
                   if (m2.Success)
                   {
                       name = item.Substring(m2.Index + 1, m2.Length - 5);

                   }
                   ArticleSubCategory sub = new ArticleSubCategory(name, url);
                   Categories.Category[i].SubCategories.Add(sub);
               }


           }
       }
       public void ExtractCategory() {
               string url = null;
               string name = null;
               tblcategories = new DataTable();
               foreach (var item in categorylines)
               {
                   url = GetLine(@"href="".+"">", 6, 8, item);
                   name = GetLine(@">.+<\/a>", 1, 5, item);

                   Categories.Add(new ArticleCategory(name, url));


               }
       }

       public string GetLine(string regex, int start, int end,string line)
       {
           find1 = new Regex(regex, RegexOptions.IgnoreCase);
           m1 = find1.Match(line);
           if (m1.Success)
           {
               return line.Substring(m1.Index+start,m1.Length-end);
           }
           else return null;
       }

       //public void ExtractArticle() { }

       public void ExtractLines(string line)
       {
           string a;

           if ((a = GetLine(categoryRegex, 0, 0, line)) != null)
           {
               tmp++;
               categorylines.Add(a);
               subcategorylines[tmp] = new List<string>();
           }
           if((a=GetLine(subcategoryRegex,0,0,line))!=null)
           {

               subcategorylines[tmp].Add(a);

           }
           //find1= new Regex(categoryRegex, RegexOptions.IgnoreCase);
           //find2= new Regex(subcategoryRegex, RegexOptions.IgnoreCase);
           //m1 = find1.Match(line);
           //m2= find2.Match(line);
           //if (m1.Success)
           //{
           //    tmp = tmp + 1;
           //    categorylines.Add(m1.Value);
           //    subcategorylines[tmp] = new List<string>();
           //}
           //else
           //{
           //    if (m2.Success)
           //    {
           //        subcategorylines[tmp].Add(m2.Value);
           //    }
           //}

       }

       public void ExtractArticleMeta() {

           tmp = 0;
           string a;
           foreach (var item in Categories.Category)
           {
               articleLines[tmp] = new List<string>();
               foreach (var subcat in item.SubCategories)
               {
                   client = new WebClient();
                   strm = client.OpenRead("http://"+subcat.Url);
                   strrdr = new StreamReader(strm, Encoding.ASCII);
                   while (strrdr.Peek()>0)
                   {
                       string line = strrdr.ReadLine();
                       line = line.Replace("\n", String.Empty);
                       line = line.Replace("\t", string.Empty);
                       line = line.Replace("\r", string.Empty);
                       line = line.Replace("\\", "");
                       if ((a= GetLine(articleregex,0,0,line))!=null)
                       {
                           articleLines[tmp].Add(a);
                       }

                   }


               }
               tmp = tmp + 1;
           }

       }
Posted
Updated 23-Feb-13 6:21am
v2
Comments
Sandeep Mewara 23-Feb-13 11:31am    
If you need specific help, you should share the related snippet. Update the code so that members can see and suggest. Use ImproveQuestion link to edit/update the question.
Shan Ali Khan 23-Feb-13 12:22pm    
actually there no problem in code sometimes it throw exception in one line and sometime it throw exception on other line, even sometime it throw exception on connect to website. I have uploaded the code please have a look.

1 solution

Please unit test your server methods though you can understand that the return result is wrong(make sure that is not business logic). If it is third party service then you can not do that. Check your exception handling code. If any exception raised then log that and investigate the log data. Make sure any exception is not suppressed.
 
Share this answer
 
Comments
Shan Ali Khan 23-Feb-13 12:23pm    
actually there no problem in code sometimes it throw exception in one line and sometime it throw exception on other line, even sometime it throw exception on connect to website. I have uploaded the code please have a look.
it is no third party code i have written code.
S. M. Ahasan Habib 24-Feb-13 0:26am    
Please show the exception lines and message though i can address the problem.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900