Click here to Skip to main content
15,881,967 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
See more:
I have programmed a crawler, but it can just work when the page is utf-8 encoded only. Could you help to make it work with utf-8, gb2312 or others?
Thanks. My code is like follows:


HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
req.Timeout = Settings.ConnectionTimeout * 1000;
HttpWebResponse response = (HttpWebResponse)req.GetResponse();
string contentType = crawler.MimeType = response.ContentType;

if (contentType != "text/html" &&
    !crawler.Downloader.AllowAllMimeTypes &&
    !crawler.Downloader.FileTypes.Contains(contentType))
    return;

byte[] buffer = ReadInstreamIntoMemory(response.GetResponseStream());

response.Close();



if (!Directory.Exists(Settings.DownloadFolder))
    Directory.CreateDirectory(Settings.DownloadFolder);

// 保存页面(到网页库).
crawler.Status = CrawlerStatusType.Save;
if (crawler.Dirty)
    crawler.StatusChanged(crawler, null);



crawler.Downloader.CrawledUrlSet.Add(url);
crawler.Downloader.CrawleHistroy.Add(new CrawleHistroyEntry() { Timestamp = DateTime.UtcNow, Url = url, Size = response.ContentLength });
lock (crawler.Downloader.TotalSizelock)
{
    crawler.Downloader.TotalSize += response.ContentLength;
}

// 提取URL并加入队列.
UrlFrontierQueueManager queue = crawler.Downloader.UrlsQueueFrontier;

if (contentType == "text/html")
{

    crawler.Status = CrawlerStatusType.Parse;
    if (crawler.Dirty)
        crawler.StatusChanged(crawler, null);

    string html = Encoding.Default.GetString(buffer);

    string str = html;
    string regstr = @"[a-zA-Z0-9]+@([a-zA-Z0-9]+\.)+[a-zA-Z0-9]{2,3}";
    string mg = "";
    System.Text.RegularExpressions.Regex rg = new System.Text.RegularExpressions.Regex(regstr);


    System.Text.RegularExpressions.MatchCollection mc = rg.Matches(str);
    for (int i = 0; i < mc.Count; i++)
    {

        string xstr = mc[i].ToString();
        SQLiteConnection sqlliteconn = new SQLiteConnection(@"Data Source=" + Settings.SavePath);
        SQLiteCommand sqlcmd = new SQLiteCommand(sqlliteconn);
        sqlliteconn.Open();

        sqlcmd.CommandText = "select count(*) from rec_email where email like '%"+xstr+"%'";
        object obj = sqlcmd.ExecuteScalar();
        if (obj != null&&int.Parse(obj.ToString())>0)
        {


        }
        else
        {
            if (!mg.Contains(mc[i].ToString()))
            {
                mg += "," + mc[i].ToString();
            }
        }
        sqlliteconn.Close();
    }

    if (mg != "")
    {
                SQLiteConnection sqlliteconn = new SQLiteConnection(@"Data Source=" + Settings.SavePath);
        SQLiteCommand sqlcmd = new SQLiteCommand(sqlliteconn);
        sqlliteconn.Open();

        sqlcmd.CommandText = "insert into rec_email(url,email) values('"+url+"','"+mg+"')";
        sqlcmd.ExecuteNonQuery();
        sqlliteconn.Close();

    }

    string baseUri = Utility.GetBaseUri(url);
    string[] links = Parser.ExtractLinks(baseUri, html);
    foreach (string link in links)
    {
               if (link.Length > 256) continue;
                if (crawler.Downloader.CrawledUrlSet.Contains(link)) continue;
             queue.Enqueue(link);
          }


Please help me .Waiting on line.Thanks.:rose:
Posted
Updated 31-Oct-10 19:23pm
v2
Comments
JF2015 1-Nov-10 1:22am    
Edited for Spelling and formatting.

1 solution

You should read the html as a byte array and check what is the encoding on it...
There are plenty of encoding detecting methods available out there.

http://www.west-wind.com/Weblog/posts/197245.aspx[^]
 
Share this answer
 

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900