This is a feed parser prototype that I wrote myself in PHP. As you can see, it's a mess; nobody would normally write it this way. :(
It uses a MySQL database; the link column in the tables stores the unique web-page link of each document. If a link does not already exist there, the item is parsed and saved.
These links are indexed and are very fast.
To use it, type ./xparser.php -l in the console; it parses the stored feed addresses from the first to the last and saves the results. When it reaches the end, it starts again from the beginning. There are 17200 feed URLs and about 97000 saved feed documents.
The problem is that after 2-3 days of parsing, the hard disk head starts making audible seeking noises and the disk breaks down. Several hard disks have already failed, and an SSD has failed as well.
Is this kind of feed parsing or crawling inherently fatal to hard disks?
My code is messed up, but I don't think it's the cause of the hard disk error. Is it related to indexed links?
I wonder what is the cause of hard disk failure.
*Please forgive the lack of comments in the code. I can't write comments because I'm not an English speaker.
What I have tried:
<pre>#!/usr/bin/php
<?php
// NOTE(review): error_reporting(0) hides every PHP diagnostic, which makes the
// display_errors setting on the next line ineffective. Both calls are kept
// unchanged so the script's console output stays the same; raise the level
// while debugging.
error_reporting(0);
ini_set("display_errors", 1);
require_once( "connect_db.php" );

// Defaults used when the matching command-line argument is absent.
$num = '1';
$option = '';
$limit = '';
$op = array( '-i', '-l', '-c', '-f', '-x', '-o');

// Argument 1: mode switch; must be one of the values in $op.
// empty() is used instead of a bare $argv[n] read so that a missing argument
// never touches an undefined array index.
if( !empty( $argv[1] ) ){
    if( in_array( $argv[1], $op, true ) ){
        $option = $argv[1];
    }else{
        echo "ERROR: Type of Option is Wrong or No Option That do for you!\n";
        return ;
    }
}

// Argument 2: numeric feed id to start from.
if( !empty( $argv[2] ) ){
    if( is_numeric( $argv[2] ) ){
        $num = $argv[2];
    }else{
        echo "ERROR: Type of Option must to be number.\n";
        return ;
    }
}

// Argument 3: numeric feed id to stop at; must not be below the start id.
// It is validated like argument 2 because it is compared numerically here and
// used as the loop bound inside startCrw().
if( !empty( $argv[3] ) ){
    if( !is_numeric( $argv[3] ) ){
        echo "ERROR: Type of Option must to be number.\n";
        return ;
    }
    $limit = $argv[3];
    if( $num > $limit ){
        echo "ERROR: Wrong option value! Cause can't over option 1 value better than option 2 value"."\n";
        return ;
    }
}

startCrw( $option , $num , $limit );
/**
 * Select one feed row that is still enabled for crawling.
 *
 * Queries wc_xml_url for the given id restricted to crawl_status='0'. If the
 * query fails the script terminates, printing the MySQL error text followed
 * by a source line number.
 *
 * @param mixed $id Feed id; interpolated into the SQL text without quoting.
 * @return mixed The value returned by $conn->query() for the SELECT.
 */
function getFeedAdr( $id )
{
    $db = connect();
    $sql = "select id, url from wc_xml_url where id={$id} and crawl_status='0'";
    $rows = $db->query( $sql );
    if( !$rows ){
        die( mysqli_error( $db ).__LINE__ );
    }
    return $rows;
}
/**
 * Print the "R-AVOID" console line for a feed that is being skipped.
 *
 * Reads incount and decount for the feed from schedule_crawle and, when a
 * row exists, echoes them together with the feed id and the caller-supplied
 * trailing text. Nothing is printed when no schedule row exists.
 *
 * @param mixed  $id         Feed id used as schedule_crawle.url_id.
 * @param string $string_url Text appended verbatim to the end of the line.
 * @return void
 */
function count_status( $id = '' , $string_url = '' )
{
    $db = connect();
    $rows = $db->query("select incount, decount from schedule_crawle where url_id='".$id."'");
    if( !( mysqli_num_rows($rows) > 0 ) ){
        return;
    }

    $counts = mysqli_fetch_assoc( $rows );
    $parts = array(
        "\n\e[1;37;44mR-AVOID\e[0m ",
        " \033[1;36mID:\033[0m ",
        $id,
        " \033[1;36mInC\033[0m ",
        $counts['incount'],
        " \033[1;36mDeC\033[0m ",
        $counts['decount'],
        $string_url,
    );
    echo implode( '', $parts );
}
/**
 * Entry point of the crawler: dispatches on the command-line option.
 *
 *   -f  parse the single feed whose id is $num
 *   -x  load the single feed whose id is $num and dump the DOM with print_r
 *   -i  print row counts of wc_xml_url / wc_xml_post and the upper id bound
 *   -c  walk the ids from $num up to the upper bound once, then stop
 *   -l  like -c, but on reaching the upper bound restart from $num (endless)
 *   -o  like -c, but on reaching the upper bound restart from id 1 (endless)
 *
 * The upper bound is $limit when given, otherwise the highest id in
 * wc_xml_url as reported by get_limit_value().
 *
 * NOTE(review): the -l and -o modes never pause between passes, and every
 * visited feed causes at least one SELECT plus (via scanSchedule /
 * updateSchedule) one UPDATE or INSERT. Presumably that is a constant write
 * load on the database disk — confirm and consider sleeping between passes.
 *
 * @param string $option One of the switches listed above.
 * @param mixed  $num    Feed id to start from.
 * @param mixed  $limit  Last feed id to visit; empty means "highest id".
 * @return void
 */
function startCrw( $option = '' , $num = '' , $limit = '' )
{
    $start_id = $num;

    // Resolve the upper bound of the id range.
    if( empty( $limit ) ){
        $max_result = get_limit_value();
        if( !$max_result ){
            echo "cannot connect db";
            return ;
        }
        $max_client = mysqli_fetch_assoc( $max_result );
        mysqli_free_result( $max_result );
        // An empty table yields no row; keep the bound null in that case.
        $max_client_value = isset( $max_client['id'] ) ? $max_client['id'] : null;
    }else{
        $max_client_value = $limit;
    }

    if( $option === '-f' ){
        $result = getFeedAdr( $num );
        if( mysqli_num_rows( $result ) ){
            $feed_result = mysqli_fetch_assoc( $result );
            echo "\033[1;33mCONNECT \033[0m>> " .$feed_result['id']." \e[1;37;40m". replace_ampersand( $feed_result['url'] )."\e[0m\n";
            feedParser( $feed_result );
        }else{
            // Only report "No value" when the feed really was not found;
            // previously this ran even after a successful parse.
            echo "\n" . $num . ": No value";
            // The second argument is passed explicitly: calling with one
            // argument throws ArgumentCountError on PHP >= 7.1.
            updateCrawleStatus( $num , __LINE__ );
        }
    }else if( $option === '-x' ){
        $result = getFeedAdr( $num );
        if( mysqli_num_rows( $result ) ){
            $feed_result = mysqli_fetch_assoc( $result );
            echo "\033[1;33mCONNECT \033[0m>> " .$feed_result['id']." \e[1;37;40m". replace_ampersand( $feed_result['url'] )."\e[0m\n";
            $xmlDoc = new DOMDocument();
            $xmlDoc->load( replace_ampersand( $feed_result['url'] ) );
            print_r( $xmlDoc );
        }
    }else if( $option === '-i' ){
        $conn = connect();
        $max_url_num = $conn->query("select count(*) from wc_xml_url");
        $max_url = mysqli_fetch_row( $max_url_num );
        $max_post_num = $conn->query("select count(*) from wc_xml_post");
        $max_post = mysqli_fetch_row( $max_post_num );
        echo "Count URL: ".$max_url['0']."\n";
        echo "Count POST: ".$max_post['0']."\n";
        echo "Max Url Id: ". $max_client_value."\n";
        return ;
    }else if( $option === '-l' || $option === '-c' || $option === '-o' ){
        while( $num-2 < $max_client_value ){
            // NOTE(review): a connection is requested on every iteration, as
            // before; connect() is defined elsewhere, so it is not hoisted.
            $conn = connect();
            $rows = $conn->query("select id, url, crawl_status from wc_xml_url where id={$num} and crawl_status='0'") or die( mysqli_error( $conn ) );
            $url = mysqli_num_rows( $rows ) ? mysqli_fetch_assoc( $rows ) : array();
            mysqli_free_result( $rows );

            if( $url ){
                if( scanSchedule( $url['id'] ) == false ){
                    // Feed is backing off: only report its counters.
                    $string_url = " \033[1;30m". replace_ampersand( $url['url'] )."\033[0m";
                    count_status($url['id'], $string_url);
                }else{
                    echo "\033[1;33mCONNECT \033[0m>> " .$url['id']." \e[1;37;40m". replace_ampersand( $url['url'] )."\e[0m\n";
                    feedParser( $url );
                }
            }else{
                echo "\n" . $num . ": No value";
            }

            // End of the range: restart or stop depending on the mode.
            if( $num == $max_client_value ){
                if( $option == '-o' ){
                    $num = "1";
                    continue;
                }
                if( $option == '-l' ){
                    $num = $start_id;
                    continue;
                }
                break;
            }
            $num++;
        }
    }
}
/**
 * Decide whether a feed is due for crawling and advance its back-off counters.
 *
 * Behaviour per schedule_crawle row for the feed:
 *   - no row yet:          a row is inserted with incount 0; the feed is due
 *                          if the insert succeeded.
 *   - incount == decount:  the feed is due; nothing is written.
 *   - incount >  decount:  decount is incremented; the feed is skipped.
 *   - incount <  decount:  incount is halved and decount reset to 0; skipped.
 *
 * The script terminates with the MySQL error text if the initial SELECT fails.
 *
 * @param mixed $url Feed id (schedule_crawle.url_id), despite the name.
 * @return bool true when the feed should be crawled now, false otherwise
 *              (previously the skip paths returned nothing, i.e. null).
 */
function scanSchedule( $url )
{
    $db = connect();
    // date() replaces strftime(), which is deprecated since PHP 8.1; the
    // produced text is the same for this purely numeric format.
    $now = date( 'Y-m-d H:i:s' );

    $query = $db->query("select incount, decount from schedule_crawle where url_id = '".$url."'") or die( mysqli_error( $db ) );

    if( mysqli_num_rows( $query ) > 0 ){
        $row = mysqli_fetch_assoc( $query );
        mysqli_free_result( $query );

        if( $row['incount'] == $row['decount'] ){
            return true;
        }
        if( $row['incount'] > $row['decount'] ){
            $db->query("update schedule_crawle set decount = decount + '1' , update_date = '".$now."' where url_id='".$url."'");
        }else if( $row['incount'] < $row['decount'] ){
            $db->query("update schedule_crawle set incount = ROUND(incount/2), decount = '0', update_date = '".$now."' where url_id='".$url."'");
        }
        return false;
    }

    mysqli_free_result( $query );
    $inserted = $db->query("insert into schedule_crawle( url_id, incount, update_date, create_date) values('".$url."', '0','".$now."','".$now."')");
    return $inserted ? true : false;
}
/**
 * Download one feed, store every item that is not yet known and update the
 * feed's schedule counters.
 *
 * RSS feeds (documents containing <item>) and Atom-style feeds (<entry>) are
 * both handled. For each item the link is checked with detect(); unknown
 * items are passed to insertFeed(). When the document cannot be loaded the
 * feed is handed to updateCrawleStatus() and, if that reports a change, a
 * line is appended to log/Disallowed_Url_<date>.txt. In every case
 * updateSchedule() is called with the number of stored items.
 *
 * Missing child elements (no <link>, <author>, <pubDate>, ...) are read as an
 * empty string instead of dereferencing a null node, which used to abort the
 * script for Atom entries without a <link>.
 *
 * @param array $url Feed row with the keys 'id' and 'url'.
 * @param int   $num Unused; kept for interface compatibility.
 * @return void
 */
function feedParser( $url = array(), $num = 0 )
{
    $succ_count = 0;
    // Remove all whitespace from every field of the feed row.
    $url = preg_replace( array('/\s+/') , array('') ,$url);

    // First descendant element with the given tag name, or null.
    $firstNode = function( $parent, $tag ){
        return $parent->getElementsByTagName( $tag )->item(0);
    };
    // Text content of that element, or '' when the element is missing.
    $nodeText = function( $parent, $tag ) use ( $firstNode ){
        $node = $firstNode( $parent, $tag );
        return $node ? $node->nodeValue : '';
    };

    $xmlDoc = new DOMDocument();
    if( $xmlDoc->load( replace_ampersand( $url['url'] ) ) ){
        if( $xmlDoc->getElementsByTagName('item')->item(0) ){
            // RSS: one <item> per post, link is element text.
            foreach( $xmlDoc->getElementsByTagName('item') as $item ){
                $link = $nodeText( $item, 'link' );
                if( detect( $link , $url['id'] ) == true ){
                    continue;
                }
                $query = array(
                    'url_id'      => $url['id'],
                    'xml'         => $url['url'],
                    'title'       => $nodeText( $item, 'title' ),
                    'author'      => $nodeText( $item, 'author' ),
                    'field'       => 'xml',
                    'link'        => $link,
                    'pubdate'     => $nodeText( $item, 'pubDate' ),
                    'description' => $nodeText( $item, 'description' ),
                );
                if( empty( $query['description'] ) ){
                    // Fall back to the <encoded> element.
                    $query['description'] = $nodeText( $item, 'encoded' );
                }
                $query['enclosure'] = get_thumbnail_cnode( $item );
                if( insertFeed( $query ) ){
                    $succ_count = $succ_count+1;
                }
            }
        }else{
            // Atom: one <entry> per post, link is the href attribute.
            foreach( $xmlDoc->getElementsByTagName('entry') as $entry ){
                $linkNode = $firstNode( $entry, 'link' );
                $link = $linkNode ? $linkNode->getAttribute('href') : '';
                if( detect( $link , $url['id'] ) == true ){
                    continue;
                }
                $query = array(
                    'url_id'      => $url['id'],
                    'xml'         => $url['url'],
                    'title'       => $nodeText( $entry, 'title' ),
                    'author'      => $nodeText( $entry, 'author' ),
                    'field'       => "xml",
                    'link'        => $link,
                    'pubdate'     => $nodeText( $entry, 'updated' ),
                    'description' => $nodeText( $entry, 'content' ),
                );
                if( empty( $query['description'] ) ){
                    $query['description'] = $nodeText( $entry, 'encoded' );
                }
                $query['enclosure'] = get_thumbnail_cnode( $entry );
                if( insertFeed( $query ) ){
                    $succ_count = $succ_count+1;
                }
            }
        }
    }else{
        // The second argument is passed explicitly: calling with one argument
        // throws ArgumentCountError on PHP >= 7.1.
        if( updateCrawleStatus( $url['id'] , __LINE__ ) ){
            $string = "\033[31mE: Failed\033[0m ID: " . $url['id'] . " \033[1;30m". $url['url']."\033[0m \n";
            // file_writing() opens the dated file in append mode, which also
            // creates it, so no separate creation step is needed.
            file_writing( 'log/Disallowed_Url_', $string );
        }
    }

    updateSchedule( $url['id'] , $succ_count );
}
/**
 * Mark a feed as disallowed (crawl_status='1') so it is no longer selected.
 *
 * The feed is looked up by id first; when it exists its crawl_status is set
 * to '1' and a "Disallowed Url" line is printed. The script terminates with
 * the MySQL error text if either statement fails.
 *
 * The existence check previously compared the url column against the id and
 * therefore never matched, so no feed was ever marked; it now uses the id
 * column like the UPDATE does.
 * NOTE(review): with the lookup fixed, a single failed download in
 * feedParser() now really disables the feed — confirm this is wanted.
 *
 * @param mixed $url_id Feed id (wc_xml_url.id).
 * @param mixed $line   Free text appended to the printed message. Optional
 *                      now, because existing callers pass only the id.
 * @return bool true when the feed exists and was updated, false otherwise.
 */
function updateCrawleStatus( $url_id = '' , $line = '' )
{
    $conn = connect();
    $existing = $conn->query("select crawl_status from wc_xml_url where id='".$url_id."'") or die( mysqli_error( $conn ));
    $found = mysqli_num_rows( $existing ) > 0;
    mysqli_free_result( $existing );

    if( !$found ){
        return false;
    }

    $conn->query("update wc_xml_url set crawl_status='1' where id='".$url_id."'") or die( mysqli_error( $conn));
    echo "\e[1;37;41mDisallowed Url: ".$url_id." ".$line."\e[0m\n";
    return true;
}
/**
 * Adjust a feed's back-off counters after a crawl attempt.
 *
 * When a schedule_crawle row exists for the feed:
 *   - new items were stored: incount is halved, decount reset, count set to
 *     the number of stored items, and an "update" line is printed;
 *   - nothing was stored:    incount is incremented, decount and count reset.
 * Nothing happens when the feed has no schedule row. The script terminates
 * with the MySQL error text if the initial SELECT fails.
 *
 * @param mixed $url           Feed id (schedule_crawle.url_id).
 * @param mixed $success_count Number of items stored in this pass; converted
 *                             to an integer, so '' counts as 0.
 * @return void
 */
function updateSchedule( $url = '' , $success_count = '' )
{
    $conn = connect();
    // Normalise once: keeps the value safe to interpolate and makes the
    // comparison with 0 independent of the PHP version's string rules.
    $success_count = (int) $success_count;
    // date() replaces strftime(), which is deprecated since PHP 8.1.
    $now = date( 'Y-m-d H:i:s' );

    $db = $conn->query("SELECT * from schedule_crawle where url_id='".$url."'") or die ("Database access failed: " . mysqli_error($conn).__LINE__);
    $has_row = mysqli_num_rows( $db ) > 0;
    mysqli_free_result( $db );
    if( !$has_row ){
        return;
    }

    if( $success_count > 0 ){
        $conn->query("update schedule_crawle set incount = ROUND(incount/2), decount = '0', count = '".$success_count."' , update_date = '".$now."' where url_id='".$url."'");
        echo "\e[7mupdate ".$success_count." ".$url." \e[0m\n";
    }else if( $success_count == 0 ){
        $conn->query("update schedule_crawle set incount = incount+1 , decount = '0', count = '0', update_date = '".$now."' where url_id='".$url."'");
    }
}
/**
 * Halve a number and round the result to the nearest whole value.
 *
 * @param mixed $int Number to halve.
 * @return float The rounded half, as returned by round().
 */
function mathRound( $int = '' )
{
    return round( $int / 2, 0 );
}
/**
 * Prepare a string for use inside a quoted SQL literal.
 *
 * The value is first escaped with mysqli_real_escape_string() and the result
 * is then converted with htmlentities() using ENT_QUOTES and UTF-8.
 *
 * @param mixed  $conn Connection handed to mysqli_real_escape_string().
 * @param string $url  Text to encode (used for links and descriptions too).
 * @return string The escaped and entity-encoded text.
 */
function realEncode( $conn , $url = '' )
{
    $sqlEscaped = mysqli_real_escape_string( $conn, $url );
    $encoded = htmlentities( $sqlEscaped , ENT_QUOTES , 'UTF-8' );
    return $encoded;
}
/**
 * Store one feed item in wc_xml_post / wc_xml_postmeta.
 *
 * The text fields are encoded with setSlashTag() or realEncode(), <script>
 * elements are removed from the description, the first usable image of the
 * description is looked up with getFeedImage(), and the publication date is
 * normalised with cvrtStrtoDate(). If a post with the same link and url_id
 * already exists a message is printed and logged instead of inserting.
 * On insert the description goes to wc_xml_postmeta and the feed's
 * last_updated column is refreshed. The connection is closed at the end.
 * The script terminates with an error text if the duplicate check, the
 * insert or the last_updated update fails.
 *
 * NOTE(review): the SQL is still assembled by string concatenation from
 * escaped feed content; prepared statements would be safer.
 *
 * @param array $query Item fields: url_id, author, category, link, title,
 *                     enclosure, description, pubdate, field. Missing keys
 *                     are treated as empty.
 * @return bool true when a new post was inserted, false when the post already
 *              existed (previously true was returned in both cases, which
 *              made the caller count duplicates as new items).
 */
function insertFeed( $query = array() )
{
    $conn = connect();

    // Supply every key that is read below so absent ones are simply empty.
    $query += array(
        'url_id'      => '',
        'author'      => '',
        'category'    => '',
        'link'        => '',
        'title'       => '',
        'enclosure'   => null,
        'imgurl'      => null,
        'field'       => '',
        'pubdate'     => '',
        'description' => '',
    );

    $query['author'] = setSlashTag( $query['author'] );
    $query['category'] = setSlashTag( $query['category'] );
    $query['link'] = realEncode( $conn , $query['link'] );
    $query['title'] = setSlashTag( $query['title'] );
    $query['enclosure'] = is_null( $query['enclosure'] ) ? null : realEncode( $conn , $query['enclosure'] );

    // getFeedImage() yields array( <img tag>, <src url> ) only on success.
    $ipr = getFeedImage( $query['description'] );
    if( is_array( $ipr )){
        $query['imgurl'] = $ipr[1];
    }
    $query['imgurl'] = is_null( $query['imgurl'] ) ? null : realEncode( $conn , $query['imgurl'] );

    $item_desc = preg_replace( '/<script\b[^>]*>(.*?)<\/script>/is', "", $query['description'] );
    $query['description'] = realEncode( $conn , $item_desc );
    $query['pubdate'] = cvrtStrtoDate( $query['pubdate'] );

    $result = $conn->query ("SELECT * from wc_xml_post where link='".$query['link']."' and url_id='".$query['url_id']."'") or die ("Database access failed: " . mysqli_error($conn) . __LINE__ );

    $inserted = false;
    if( mysqli_num_rows( $result ) > 0 ){
        $error = mysqli_fetch_assoc( $result );
        // The stored parse_date is shown; the old code read an undefined
        // variable here and therefore never printed the row's date.
        $parsed_at = isset( $error['parse_date'] ) ? $error['parse_date'] : '';
        $sendError = "\033[31mE: Crawled\033[0m " . $error['id'] . " \033[1;30m" . sainitString($error['link']) . "\033[0m " . sainitString($error['title']) . "\n Date: " . $parsed_at;
        echo $sendError;
        writeLog( $sendError );
    }else{
        // date() replaces strftime(), which is deprecated since PHP 8.1.
        $setTime = date( 'Y-m-d H:i:s' );
        $values = array(
            $query['url_id'],
            $query['author'],
            $query['category'],
            $query['link'],
            $query['title'],
            $query['enclosure'],
            $query['imgurl'],
            $query['field'],
            $query['pubdate'],
            $setTime,
        );
        $conn->query("insert into wc_xml_post(url_id, author, category, link, title, enclosure, image, field, pubdate, parse_date )values('".implode( "','", $values )."' )")or die ("Insert failed: " . __LINE__ . mysqli_error($conn));
        $inserted = true;

        $insert_id = mysqli_insert_id( $conn );
        if( $insert_id ){
            $conn->query("insert into wc_xml_postmeta(xml_post_id, meta_value) values('".$insert_id."','".$query['description']."')");
            $conn->query("update wc_xml_url set last_updated='".$setTime."' where id='".$query['url_id']."'")or die(mysqli_error($conn));
        }

        echo "\033[36mSuccess\033[0m \033[1;37m" .$insert_id. " \033[0m ";
        echo "\033[1;32m".$query['title']."\033[0m \n";
        if( $query['imgurl'] ){
            echo "\033[33m".$query['imgurl']."\033[0m \n";
        }
        if( $query['enclosure'] ){
            echo "\033[35m".$query['enclosure']."\033[0m \n";
        }
    }

    mysqli_free_result( $result );
    mysqli_close( $conn );
    return $inserted;
}
/**
 * Check whether a link of a given feed is already stored in wc_xml_post.
 *
 * The link is encoded with realEncode() before it is compared. When a row is
 * found a "Crawled" line is printed. A failing query is written to the
 * exception log and treated as "not found".
 *
 * @param string $link   Item link as read from the feed.
 * @param mixed  $url_id Feed id (wc_xml_post.url_id).
 * @return bool true when the link is already stored, false when it is not or
 *              when the lookup failed.
 */
function detect( $link , $url_id )
{
    $conn = connect();
    $sql = "SELECT link from wc_xml_post where link='".realEncode( $conn , $link )."' and url_id='".$url_id."'";

    try{
        $result = $conn->query( $sql );
        if( !$result ){
            throw new Exception("Database access failed: " . mysqli_error($conn), 16);
        }
    }catch( Exception $e ){
        // 16 selects the exception log in writeLog(). The old code passed
        // $e->getCode (a property read, not a call), so the category was
        // never set and the entry ended up in the general error log.
        writeLog( $e->getMessage(), 16 );
        return false;
    }

    if( !mysqli_num_rows( $result ) ){
        mysqli_free_result( $result );
        return false;
    }

    $row = mysqli_fetch_assoc( $result );
    mysqli_free_result( $result );
    echo "\033[31mCrawled\033[0m ".$row['link']."\n";
    return true;
}
/**
 * Find the first image in an HTML fragment that is larger than 62x62 pixels.
 *
 * Every <img> tag with a double-quoted src attribute is measured with
 * getimagesize() in document order; the first one whose width and height
 * both exceed 62 is returned. Tags without such a src, and images that
 * cannot be measured, are skipped.
 *
 * Previously the search stopped at the first image regardless of its size,
 * so a small leading image hid every later one.
 * NOTE(review): getimagesize() has to read each candidate image, so more
 * images may be fetched per description than before.
 *
 * @param string $description HTML fragment (slashes are stripped first).
 * @return mixed array( <img tag>, <src value> ) for the first fitting image,
 *               the string "Error 1" when $description is empty, or null
 *               when no image qualifies.
 */
function getFeedImage( $description = '' )
{
    if( empty( $description ) ){
        return "Error 1";
    }

    // No match at all (or a regex failure) means there is nothing to measure.
    if( !preg_match_all( '/<img[^>]+>/i', stripslashes( $description ), $tags ) ){
        return null;
    }

    foreach( $tags[0] as $tag ){
        if( !preg_match( '/src="([^"]+)/i', $tag, $src ) ){
            continue;
        }
        $size = getimagesize( $src[1] );
        if( !is_array( $size ) ){
            continue;
        }
        if( $size[0] > 62 && $size[1] > 62 ){
            return array( $tag , $src[1] );
        }
    }

    return null;
}
/**
 * Append a message to one of the dated log files.
 *
 * Messages with option 16 go to log/Exception<date>.txt, everything else to
 * log/error_log<date>.txt. If today's file does not exist yet it is created
 * first; the script terminates when it cannot be created. The line itself is
 * written by file_writing().
 *
 * @param string $string Message to log.
 * @param mixed  $option Log category; '16' selects the exception log.
 * @return void
 */
function writeLog( $string = '' , $option = '' )
{
    $file = ( $option == '16' ) ? "log/Exception" : "log/error_log";
    $path = $file.date( 'Y-m-d', time()).".txt";

    if( !file_exists( $path ) ){
        $fh = fopen( $path, 'c');
        if( !$fh ){
            die("failed to create file".__LINE__."\n");
        }
        fclose($fh);
    }

    file_writing( $file, $string );
}
/**
 * Append one timestamped line to "<prefix><Y-m-d>.txt".
 *
 * The file is opened in append mode (and created if necessary). A UTF-8 byte
 * order mark is written only when the file is still empty; it used to be
 * written before every line, scattering BOM bytes through the log. The
 * script terminates when the file cannot be opened or written.
 *
 * @param string $file   Path prefix; the current date and ".txt" are appended.
 * @param string $string Text of the log line.
 * @return void
 */
function file_writing( $file, $string )
{
    $timestamp = date( 'Y-m-d H:i:s', time());
    $path = $file.date( 'Y-m-d', time()).".txt";

    $fh = fopen( $path, 'a+');
    if( !$fh ){
        die("failed to create file".__LINE__."\n");
    }

    // The BOM belongs at the very start of the file only.
    $stat = fstat( $fh );
    if( $stat !== false && $stat['size'] === 0 ){
        fwrite($fh, pack("CCC",0xef,0xbb,0xbf) );
    }

    fwrite($fh, $string." date: ".$timestamp."\n" ) or die("Could not write to file");
    fclose($fh);
}
/**
 * Read the value of the first child node of the first matching element.
 *
 * @param mixed  $parent Node or document that offers getElementsByTagName().
 * @param string $child  Tag name to look for below $parent.
 * @return mixed nodeValue of the first child node of the first element named
 *               $child, or null when the element or its first child is
 *               missing.
 */
function get_xml_cnode( $parent = '' , $child = '' )
{
    $matches = $parent->getElementsByTagName($child);
    if( !$matches ){
        return null;
    }

    $element = $matches->item(0);
    if( !$element ){
        return null;
    }

    $firstChild = $element->childNodes->item(0);
    if( !$firstChild ){
        return null;
    }

    return $firstChild->nodeValue;
}
/**
 * Convert a free-form date string into "Y-m-d H:i:s".
 *
 * @param string $string Date text as found in a feed.
 * @return string|null The formatted date, or null when $string is empty or
 *                     cannot be parsed by strtotime(). Unparseable text used
 *                     to be formatted as timestamp 0, i.e. an epoch date.
 */
function cvrtStrtoDate( $string = '' )
{
    if( !$string ){
        return null;
    }

    $timestamp = strtotime( $string );
    if( $timestamp === false ){
        return null;
    }

    return date( "Y-m-d H:i:s", $timestamp );
}
/**
 * Pick a media URL from a feed item.
 *
 * The tags enclosure, thumbnail and content are tried in that order; the
 * "url" attribute of the first tag that is present is returned, even when
 * that attribute is empty.
 *
 * @param mixed $item Element that offers getElementsByTagName().
 * @return mixed Value of the url attribute, or null when none of the three
 *               tags exists below $item.
 */
function get_thumbnail_cnode( $item = '' )
{
    foreach( array( 'enclosure', 'thumbnail', 'content' ) as $tagName ){
        $candidate = $item->getElementsByTagName( $tagName )->item(0);
        if( $candidate ){
            return $candidate->getAttribute('url');
        }
    }
    return null;
}
/**
 * Remove markup from a string and backslash-escape it.
 *
 * @param string $string Text to clean.
 * @return string $string after strip_tags() followed by addslashes().
 */
function sainitString( $string = '' )
{
    return addslashes( strip_tags( $string ) );
}
/**
 * Entity-encode a string and backslash-escape the result.
 *
 * @param string $string Text to encode.
 * @return string $string after htmlentities() (default flags) followed by
 *                addslashes().
 */
function setSlashTag( $string = '' )
{
    return addslashes( htmlentities( $string ) );
}
/**
 * Entity-encode and backslash-escape the elements at indexes 0..count-1.
 *
 * @param array $var Array whose integer positions 0..count-1 are processed.
 * @return array The array with each of those elements passed through
 *               htmlentities() and then addslashes().
 */
function set_tag_n_slash( $var = array() )
{
    $total = count( $var );
    $index = 0;
    while( $index < $total ){
        $var[$index] = addslashes( htmlentities( $var[$index] ) );
        ++$index;
    }
    return $var;
}
/**
 * Undo backslash-escaping and entity-encoding for indexes 0..count-1.
 *
 * @param array $var Array whose integer positions 0..count-1 are processed.
 * @return array The array with each of those elements passed through
 *               stripslashes() and then html_entity_decode().
 */
function off_tag_n_slash( $var = array() )
{
    $total = count( $var );
    $index = 0;
    while( $index < $total ){
        $var[$index] = html_entity_decode( stripslashes( $var[$index] ) );
        ++$index;
    }
    return $var;
}
/**
 * Query the highest feed id in wc_xml_url.
 *
 * @return mixed The value returned by $conn->query() for the SELECT; callers
 *               fetch the single row's 'id' column from it.
 */
function get_limit_value()
{
    $db = connect();
    $sql = "select id from wc_xml_url order by id DESC limit 1";
    return $db->query( $sql );
}
/**
 * Turn HTML-encoded ampersands in an address back into plain "&".
 *
 * The previous body replaced "&" with "&", which changes nothing; the entity
 * form "&amp;" is what has to be decoded before the address is loaded or
 * shown.
 *
 * @param string $address Feed address, possibly containing "&amp;".
 * @return string The address with every "&amp;" replaced by "&".
 */
function replace_ampersand( $address = '' )
{
    return str_replace( '&amp;', '&', $address );
}
?>