Hi , im trying to extract data from that website , first im trying to get categories and from each categories i am getting subcategories and then from each subcategory im trying to read and extract some text.
Im using webclient that im facing some strange problem that sometimes i doesnt read data sometimes it reads data.
Sometimes i get 30 categories but get subcategories from only 10 sometimes i get 10 categories and read all subcategories using web client.
how to solve this problem ?
followin is he code :
public Extract(string url)
{
client = new WebClient();
strm = client.OpenRead(url);
strrdr = new StreamReader(strm, Encoding.ASCII);
categorylines = new List<string>();
subcategorylines = new List<string>[30];
while (strrdr.Peek() > 0)
{
string line = strrdr.ReadLine();
line = line.Replace("\n", String.Empty);
line = line.Replace("\t", string.Empty);
line = line.Replace("\r", string.Empty);
line = line.Replace("\\", "");
ExtractLines(line);
}
strrdr.Close();
}
public void ExtractSubcategories() {
string url = null;
string name = null;
for (int i = 0; i < Categories.Category.Count; i++)
{
foreach (var item in subcategorylines[i])
{
find1 = new Regex(@"href="".+"">", RegexOptions.IgnoreCase);
find2 = new Regex(@">.+<\/a>", RegexOptions.IgnoreCase);
m1 = find1.Match(item);
m2 = find2.Match(item);
if (m1.Success)
{
url = item.Substring(m1.Index+6, m1.Length - 8);
url = "www.codeproject.com" + url;
}
if (m2.Success)
{
name = item.Substring(m2.Index + 1, m2.Length - 5);
}
ArticleSubCategory sub = new ArticleSubCategory(name, url);
Categories.Category[i].SubCategories.Add(sub);
}
}
}
public void ExtractCategory() {
string url = null;
string name = null;
tblcategories = new DataTable();
foreach (var item in categorylines)
{
url = GetLine(@"href="".+"">", 6, 8, item);
name = GetLine(@">.+<\/a>", 1, 5, item);
Categories.Add(new ArticleCategory(name, url));
}
}
public string GetLine(string regex, int start, int end,string line)
{
find1 = new Regex(regex, RegexOptions.IgnoreCase);
m1 = find1.Match(line);
if (m1.Success)
{
return line.Substring(m1.Index+start,m1.Length-end);
}
else return null;
}
public void ExtractLines(string line)
{
string a;
if ((a = GetLine(categoryRegex, 0, 0, line)) != null)
{
tmp++;
categorylines.Add(a);
subcategorylines[tmp] = new List<string>();
}
if((a=GetLine(subcategoryRegex,0,0,line))!=null)
{
subcategorylines[tmp].Add(a);
}
}
public void ExtractArticleMeta() {
tmp = 0;
string a;
foreach (var item in Categories.Category)
{
articleLines[tmp] = new List<string>();
foreach (var subcat in item.SubCategories)
{
client = new WebClient();
strm = client.OpenRead("http://"+subcat.Url);
strrdr = new StreamReader(strm, Encoding.ASCII);
while (strrdr.Peek()>0)
{
string line = strrdr.ReadLine();
line = line.Replace("\n", String.Empty);
line = line.Replace("\t", string.Empty);
line = line.Replace("\r", string.Empty);
line = line.Replace("\\", "");
if ((a= GetLine(articleregex,0,0,line))!=null)
{
articleLines[tmp].Add(a);
}
}
}
tmp = tmp + 1;
}
}