// Download `url`, record it in the crawl history, harvest e-mail addresses from
// HTML pages into the rec_email table, and enqueue the links found on the page.
HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
// HttpWebRequest.Timeout is in milliseconds; presumably the setting is in seconds - TODO confirm.
req.Timeout = Settings.ConnectionTimeout * 1000;
HttpWebResponse response = (HttpWebResponse)req.GetResponse();
string contentType = crawler.MimeType = response.ContentType;
// NOTE(review): ContentType can carry parameters (e.g. "text/html; charset=utf-8"),
// in which case the exact comparisons below do not match - confirm whether that is intended.
if (contentType != "text/html" && !crawler.Downloader.AllowAllMimeTypes && !crawler.Downloader.FileTypes.Contains(contentType))
{
    // Release the connection before bailing out; otherwise it stays open until finalization.
    response.Close();
    return;
}

// Capture the length before closing the response, and always close it even if the read fails.
long contentLength = response.ContentLength;
byte[] buffer;
try
{
    buffer = ReadInstreamIntoMemory(response.GetResponseStream());
}
finally
{
    response.Close();
}
// ContentLength is -1 when the server sent no Content-Length header; fall back to
// the number of bytes read so the history and the running total never get a negative size.
if (contentLength < 0 && buffer != null)
{
    contentLength = buffer.Length;
}

// CreateDirectory is a no-op when the folder already exists.
Directory.CreateDirectory(Settings.DownloadFolder);

// Save the page (to the page store).
crawler.Status = CrawlerStatusType.Save;
if (crawler.Dirty)
{
    crawler.StatusChanged(crawler, null);
}

crawler.Downloader.CrawledUrlSet.Add(url);
crawler.Downloader.CrawleHistroy.Add(new CrawleHistroyEntry() { Timestamp = DateTime.UtcNow, Url = url, Size = contentLength });
lock (crawler.Downloader.TotalSizelock)
{
    crawler.Downloader.TotalSize += contentLength;
}

// Extract URLs and add them to the queue.
UrlFrontierQueueManager queue = crawler.Downloader.UrlsQueueFrontier;
if (contentType == "text/html")
{
    crawler.Status = CrawlerStatusType.Parse;
    if (crawler.Dirty)
    {
        crawler.StatusChanged(crawler, null);
    }

    string html = Encoding.Default.GetString(buffer);
    string str = html;

    // Collect e-mail addresses that are not yet stored in rec_email.
    // `mg` is a comma-prefixed list (",a@b.com,c@d.org"); the format is kept as-is
    // because it is what gets written to the `email` column.
    string regstr = @"[a-zA-Z0-9]+@([a-zA-Z0-9]+\.)+[a-zA-Z0-9]{2,3}";
    string mg = "";
    System.Text.RegularExpressions.Regex rg = new System.Text.RegularExpressions.Regex(regstr);
    System.Text.RegularExpressions.MatchCollection mc = rg.Matches(str);
    if (mc.Count > 0)
    {
        // One connection for the whole page instead of one per match; `using`
        // guarantees it is released even when a statement throws.
        using (SQLiteConnection sqlliteconn = new SQLiteConnection(@"Data Source=" + Settings.SavePath))
        {
            sqlliteconn.Open();

            using (SQLiteCommand sqlcmd = new SQLiteCommand(sqlliteconn))
            {
                // Parameterized so scraped text is never spliced into the SQL.
                sqlcmd.CommandText = "select count(*) from rec_email where email like @pattern";
                for (int i = 0; i < mc.Count; i++)
                {
                    string xstr = mc[i].ToString();
                    sqlcmd.Parameters.Clear();
                    sqlcmd.Parameters.AddWithValue("@pattern", "%" + xstr + "%");
                    object obj = sqlcmd.ExecuteScalar();
                    bool alreadyStored = obj != null && Convert.ToInt64(obj) > 0;

                    // Skip addresses already in the table or already collected from this page.
                    if (!alreadyStored && !mg.Contains(xstr))
                    {
                        mg += "," + xstr;
                    }
                }
            }

            if (mg != "")
            {
                using (SQLiteCommand sqlcmd = new SQLiteCommand(sqlliteconn))
                {
                    // The URL comes from crawled pages and may contain quotes; bind it as a parameter.
                    sqlcmd.CommandText = "insert into rec_email(url,email) values(@url,@email)";
                    sqlcmd.Parameters.AddWithValue("@url", url);
                    sqlcmd.Parameters.AddWithValue("@email", mg);
                    sqlcmd.ExecuteNonQuery();
                }
            }
        }
    }

    // Queue every not-yet-crawled link; overly long URLs are ignored.
    string baseUri = Utility.GetBaseUri(url);
    string[] links = Parser.ExtractLinks(baseUri, html);
    foreach (string link in links)
    {
        if (link.Length > 256)
        {
            continue;
        }
        if (crawler.Downloader.CrawledUrlSet.Contains(link))
        {
            continue;
        }
        queue.Enqueue(link);
    }
var
This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)