#include "stdafx.h"
#include <windows.h>
#include <iostream>
#include <winsock2.h>
#include <ws2tcpip.h>
#include <vector>
#include <boost/regex.hpp>
#include <boost/lambda/lambda.hpp>
using namespace std;
using namespace boost;
#pragma comment(lib, "ws2_32.lib ")
// Winsock implementation details; filled in by the WSAStartup() call in _tmain.
WSAData Wsadata;
// Winsock version requested from WSAStartup(); assigned in _tmain before the call.
WORD wVersion;
// One forum post as scraped from a listing page. Every field holds the raw
// text of a regex capture; nothing is converted or validated here.
struct DOC {
    std::string pcClass;  // 1st capture of the post pattern (a single capital letter)
    std::string pcNm;     // 5th capture
    std::string pcNmWb;   // host name followed by the 4th capture
    std::string pcDocNm;  // 3rd capture: text that follows the post link
    std::string pcDocWb;  // 2nd capture: the href value of the post link
    std::string pcDy;     // 8th capture; written to the report after the reply count
    std::string pcHr;     // 9th capture
    std::string pcRpDy;   // not assigned in the visible code (presumably last-reply day - TODO confirm)
    std::string pcRpHr;   // not assigned in the visible code (presumably last-reply hour - TODO confirm)
    std::string pcRead;   // 6th capture: read count, parsed with atoi() by the caller
    std::string pcRpCn;   // 7th capture: reply count
};
// File-scope handles. hCont is the "Contents.doc" output file opened in _tmain.
// hSemaThr and hSema are not referenced in the visible part of this file
// (presumably semaphores for the Grab worker - TODO confirm).
HANDLE hSemaThr,hCont,hSema;
// Not referenced in the visible part of this file.
// NOTE(review): purpose unknown from here - possibly a worker-thread budget; confirm or remove.
int CanThr;
// Extracts the Host header value from an HTTP request; returns 1 on success, 0 otherwise.
int GetHostName(const string& HttpHead,string& hostname);
// Overwrites HttpRequest with a GET request for the given http:// URL; returns 1 on success, 0 otherwise.
int BuildRequest(string& HttpRequest ,const string& url);
// Resolves `hostname` (IPv4 only) and connects `client` to it on TCP port 80.
//
// hostname  host name or dotted IPv4 address to resolve
// client    an already-created stream socket; it is connected in place and is
//           NOT closed here on failure - the caller owns it
//
// Returns 1 on success, 0 when name resolution or the connection attempt
// fails (a diagnostic is written to stdout).
//
// Winsock itself is deliberately left initialised on failure: tearing the
// library down is the responsibility of whoever called WSAStartup(), and the
// caller may still want to create further sockets.
int ConnectClient(const string& hostname,SOCKET& client)
{
    const char* const port = "80";

    struct addrinfo aiHints;
    memset(&aiHints, 0, sizeof(aiHints));
    aiHints.ai_family = AF_INET;
    aiHints.ai_socktype = SOCK_STREAM;
    aiHints.ai_protocol = IPPROTO_TCP;

    struct addrinfo* aiList = NULL;
    const int retVal = getaddrinfo(hostname.c_str(), port, &aiHints, &aiList);
    if (retVal != 0 || aiList == NULL) {
        // getaddrinfo() reports its error code through the return value.
        cout<<"getaddrinfo() failed.\n"<<retVal;
        if (aiList != NULL) {
            freeaddrinfo(aiList);
        }
        return 0;
    }

    // Use the address length supplied by the resolver rather than sizeof(sockaddr).
    const int rc = connect(client, aiList->ai_addr, static_cast<int>(aiList->ai_addrlen));
    // Capture the error before any other call can overwrite it.
    const int connectError = (rc == SOCKET_ERROR) ? WSAGetLastError() : 0;

    // The resolver result is heap-allocated and must be released on every path.
    freeaddrinfo(aiList);

    if (rc == SOCKET_ERROR) {
        cout<<"connect error"<<connectError<<endl;
        return 0;
    }
    return 1;
}
int SendRecive(const string& request,SOCKET& client,string& answer)
{
answer.clear();
string HostName;
if(!GetHostName(request,HostName))
{
return 0;
}
if(!ConnectClient(HostName,client))
{
return 0;
}
char* crequest=new char[request.size()+1];
memset(crequest,0,request.size()+1);
strcpy(crequest,request.c_str());
send(client,crequest,(int)strlen(crequest),NULL);
delete[] crequest;
crequest=0;
char bufrecv[1000];
int byRecv=0;
while(byRecv!=SOCKET_ERROR){
memset(bufrecv,0,1000);
byRecv=recv(client,bufrecv,999,0);
if ( byRecv == 0 ){
cout<<"all received out"<<endl;
return 1;
}
else if(byRecv == WSAECONNRESET){
cout<<"wsaeconnreset"<<endl;
return 0;
}
answer+=string(bufrecv);
}
return 0;
}
// Extracts the value of the Host header from an HTTP request.
//
// HttpHead  request text, lines terminated by CRLF (a bare LF is tolerated)
// hostname  receives the header value with surrounding blanks removed;
//           left untouched when 0 is returned
//
// Returns 1 when a non-empty Host header was found, 0 otherwise.
//
// The header name must start a line and is compared case-insensitively, so
// headers that merely end in "Host" (e.g. "X-Forwarded-Host") are ignored.
// A final line without a terminator is not considered: it may be a header
// that was cut off mid-value.
int GetHostName(const string& HttpHead,string& hostname)
{
    typedef std::string::size_type Pos;
    static const char kName[] = "host";
    const Pos kNameLen = sizeof(kName) - 1;

    Pos lineStart = 0;
    while (lineStart < HttpHead.size()) {
        const Pos lineEnd = HttpHead.find('\n', lineStart);
        if (lineEnd == std::string::npos) {
            break;
        }

        // [lineStart, contentEnd) is the line without its CRLF / LF.
        Pos contentEnd = lineEnd;
        if (contentEnd > lineStart && HttpHead[contentEnd - 1] == '\r') {
            --contentEnd;
        }

        // A candidate needs at least "host:" and the colon right after the name.
        bool isHost = (contentEnd - lineStart > kNameLen)
                   && (HttpHead[lineStart + kNameLen] == ':');
        for (Pos i = 0; isHost && i < kNameLen; ++i) {
            char c = HttpHead[lineStart + i];
            if (c >= 'A' && c <= 'Z') {
                c = static_cast<char>(c - 'A' + 'a');  // ASCII lower-case
            }
            isHost = (c == kName[i]);
        }

        if (isHost) {
            Pos valueBegin = lineStart + kNameLen + 1;
            while (valueBegin < contentEnd
                   && (HttpHead[valueBegin] == ' ' || HttpHead[valueBegin] == '\t')) {
                ++valueBegin;
            }
            Pos valueEnd = contentEnd;
            while (valueEnd > valueBegin
                   && (HttpHead[valueEnd - 1] == ' ' || HttpHead[valueEnd - 1] == '\t')) {
                --valueEnd;
            }
            if (valueEnd == valueBegin) {
                return 0;  // "Host:" with no value is not usable
            }
            hostname.assign(HttpHead, valueBegin, valueEnd - valueBegin);
            return 1;
        }

        lineStart = lineEnd + 1;
    }
    return 0;
}
// Declared with a Win32 thread-routine signature, but neither defined nor
// called in the visible part of this file.
// NOTE(review): presumably meant to process one DOC on a worker thread - confirm or remove.
DWORD WINAPI Grab(void* doc);
int _tmain(int argc, _TCHAR* argv[])
{
wVersion=MAKEWORD(1,1);
int iResult=::WSAStartup(wVersion,&Wsadata);
switch(iResult){
case WSASYSNOTREADY:
cout<<"not ready";
exit(1);
case WSAVERNOTSUPPORTED:
cout<<"version not supported";
exit(1);
case WSAEFAULT:
cout<<"wsadata fault";
exit(1);
}
const char cstrFilename[]="webpage.txt";
HANDLE hdFile=CreateFile(cstrFilename,GENERIC_READ|GENERIC_WRITE,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL);
if(hdFile==INVALID_HANDLE_VALUE){
cout<<"cant't create the file;"<<GetLastError()<<endl;
return 0;
}
HANDLE hErrorFile=CreateFile("ERROR.doc",GENERIC_READ|GENERIC_WRITE,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL);
if(hdFile==INVALID_HANDLE_VALUE){
cout<<"cant't create the file;"<<GetLastError()<<endl;
return 0;
}
SetFilePointer(hdFile,0,0,FILE_BEGIN);
char HttpRequestFile[1000];
memset(HttpRequestFile,0,1000);
DWORD nRead;
BOOL bResult=ReadFile(hdFile,HttpRequestFile,500,&nRead,NULL);
if(!bResult){
cout<<"readfile error"<<GetLastError()<<endl;
return 0;
}
string HttpRequest(HttpRequestFile);
string HostName;
SOCKET client=socket(AF_INET,SOCK_STREAM,0);
if(client==INVALID_SOCKET){
cout<<endl<<"socket error:"<<WSAGetLastError()<<endl;
WSACleanup();
return 0;
}
string HttpResponse;
if(!SendRecive(HttpRequest, client,HttpResponse))
{
cout<<"Receive none"<<endl<<GetLastError()<<endl;
return 0;
}
closesocket(client);
hCont=CreateFile("Contents.doc",GENERIC_WRITE|GENERIC_READ,FILE_SHARE_WRITE|FILE_SHARE_READ,0,OPEN_ALWAYS,FILE_ATTRIBUTE_NORMAL,0);
if(hdFile==INVALID_HANDLE_VALUE){
cout<<"cant't create the file;"<<GetLastError()<<endl;
return 0;
}
SetFilePointer(hCont,0,0,FILE_END);
DWORD time=GetTickCount();
boost::regex rPost("([A-Z])[\\s\\S]*?href='([\\s\\S]+?)'[\\s\\S]+?>([\\s\\S]+?)[\\s\\S]+?)(\\d+?)[\\s\\S]+?tdfont>(\\d+?)[\\s\\S]+?tdfont>([\\s\\S]+?)\\s([\\s\\S]+?)"),rNextPage("上一页\\s<a\\shref>下一页"),rNextPage2("<a\\shref>下一页");
smatch sm;
string::const_iterator HttpResponseBegin,HttpResponseEnd;
string NextPageUrl;
int i=1;
dd;
while(++i){
if(i/10==0) Sleep(1000);
_ASSERTE(HttpResponse.size()>0);
HttpResponseBegin=HttpResponse.begin();
HttpResponseEnd=HttpResponse.end();
while(regex_search(HttpResponseBegin,HttpResponseEnd,sm,rPost))
{
HttpResponseBegin=sm[0].second;
DOC dc;
dc.pcClass=sm[1];
dc.pcDocWb=sm[2];
dc.pcDocNm=sm[3];
dc.pcNmWb=HostName;
dc.pcNmWb+=sm[4];
dc.pcNm=sm[5];
dc.pcRead=sm[6];
dc.pcRpCn=sm[7];
dc.pcDy=sm[8];
dc.pcHr=sm[9];
int nRead=atoi(dc.pcRead.c_str());
if (nRead<10000) continue;
string Answer=dc.pcDocNm+"\n"+dc.pcDocWb+"\n"+"阅读次数:"+dc.pcRead+"\n"+"回复次数"+dc.pcRpCn+"\n"+dc.pcDy+"\n\n\n";
DWORD WordWritten;
if(!WriteFile(hCont,Answer.c_str(),Answer.size(),&WordWritten,0)){
cout<<"Cant't Write"<<Answer<<endl<<GetLastError()<<endl;
}
}
HttpResponseBegin=HttpResponse.begin();
if(!regex_search(HttpResponseBegin,HttpResponseEnd,sm,rNextPage)){
if(!regex_search(HttpResponseBegin,HttpResponseEnd,sm,rNextPage2) )
{
char* Buffer=new char[HttpResponse.size()+1];
memset(Buffer,0,HttpResponse.size()+1);
strcpy(Buffer,HttpResponse.c_str());
DWORD dw;
WriteFile(hErrorFile,Buffer,HttpResponse.size(),&dw,0);
break;
}
NextPageUrl=sm[1];
}
else{
NextPageUrl="http://www.tianya.cn/new/publicforum/articleslist.asp"+sm[1];
}
if(!BuildRequest(HttpRequest,NextPageUrl)){
cout<<"can't build request"<<endl;
return 0;
}
client=socket(AF_INET,SOCK_STREAM,0);
if(client==INVALID_SOCKET){
cout<<endl<<"socket error:"<<WSAGetLastError()<<endl;
WSACleanup();
return 0;
}
if(!SendRecive(HttpRequest,client,HttpResponse)){
cout<<"Cant receive "<<endl<<GetLastError()<<endl;
}
closesocket(client);
}
CloseHandle(hCont);
CloseHandle(hdFile);
WSACleanup();
system("pause");
return 0;
}
// Builds the HTTP GET request for an absolute http:// URL.
//
// HttpRequest  overwritten with the complete request text on success;
//              left untouched when 0 is returned
// url          URL containing "http://<host>[/<path>]"; when the path is
//              missing, "/" is requested
//
// Returns 1 on success, 0 when the URL has no "http://" part, has an empty
// host, or contains CR/LF (which would otherwise inject extra request lines).
//
// NOTE(review): the Referer and If-Modified-Since headers are fixed values
// carried over unchanged; confirm they are still wanted.
int BuildRequest(string& HttpRequest ,const string& url)
{
    typedef std::string::size_type Pos;
    static const char kScheme[] = "http://";

    const Pos schemeAt = url.find(kScheme);
    if (schemeAt == std::string::npos) return 0;
    if (url.find_first_of("\r\n") != std::string::npos) return 0;

    const Pos hostBegin = schemeAt + (sizeof(kScheme) - 1);
    const Pos pathBegin = url.find('/', hostBegin);
    const Pos hostEnd = (pathBegin == std::string::npos) ? url.size() : pathBegin;
    if (hostEnd == hostBegin) return 0;  // "http://" followed by nothing or by '/'

    const std::string HostName(url, hostBegin, hostEnd - hostBegin);
    const std::string RequestTarget =
        (pathBegin == std::string::npos) ? std::string("/") : url.substr(pathBegin);

    std::string built("GET ");
    built += RequestTarget;
    built += " HTTP/1.1\r\n";
    built += "Accept: */*\r\nReferer: http://www.tianya.cn/publicforum/Content/house/1/99375.shtml\r\nAccept-Language: zh-cn\r\nUA-CPU: x86\r\nIf-Modified-Since: Thu, 22 Jan 2009 02:44:12 GMT; length=1088\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; CIBA)\r\n";
    built += "Host: ";
    built += HostName;
    // The reply is read until the server closes the connection, so ask the
    // server to close it instead of keeping it alive.
    built += "\r\nConnection: close\r\n\r\n";

    HttpRequest.swap(built);
    return 1;
}
// Note: the webpage.txt file contains a raw HTTP request stream, for example:
//   GET /publicforum/articleslist/0/develop.shtml HTTP/1.1
//   Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*
// Modified on Sunday, March 8, 2009 7:29 AM