以前攒的一些函数,都是用来抓取网页做分析的,其中有些可能已经失效,凑合看吧。

<?php
	// Full open tag: the short form "<?" is only parsed as PHP when the
	// short_open_tag ini setting is enabled; otherwise the file is emitted as text.

	// User-Agent header value read (as a global) by get_url_content(); it
	// identifies the client as a desktop Internet Explorer 6 browser.
	$user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; TencentTraveler ; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727)';
	


/**
 * Convert fetched page content to UTF-8.
 *
 * The source charset is the one the page declares (looked up with
 * get_pagecode()); when nothing is declared it is guessed with
 * mb_detect_encoding() from the candidates UTF-8 and GB2312.
 *
 * @param string $data Raw page content.
 * @return string The content converted to UTF-8 (bytes that cannot be converted
 *                are dropped via //IGNORE). Returns '' for empty or non-string
 *                input, and the input unchanged when no source charset can be
 *                determined or the conversion itself fails.
 */
function conv2utf8($data){
	// A failed download arrives here as false; there is nothing to convert.
	if(!is_string($data) || $data===''){
		return '';
	}

	$encode=get_pagecode($data);
	if(!$encode){
		// Strict mode so an undetectable input yields false instead of a loose guess.
		$encode=mb_detect_encoding($data,'UTF-8,GB2312',true);
	}
	if(!$encode){
		return $data;
	}

	// GBK is a superset of GB2312; pages that declare GB2312 frequently contain
	// GBK-only characters, which //IGNORE would otherwise silently discard.
	if(strtoupper($encode)==='GB2312'){
		$encode='GBK';
	}

	$converted=iconv($encode,'UTF-8//IGNORE',$data);

	// iconv() returns false for an unknown charset name (e.g. a bogus value in
	// the page's meta tag); hand back the original bytes rather than false.
	return $converted===false ? $data : $converted;
}

/**
 * Extract the charset a page declares for itself.
 *
 * Recognises both the classic
 * <meta http-equiv="Content-Type" content="text/html; charset=...">
 * form and the short <meta charset="..."> form, with double quotes, single
 * quotes or no quotes around the value.
 *
 * @param string $data Page content to inspect.
 * @return string Upper-cased charset name, or '' when none is declared.
 */
function get_pagecode($data){
	if(!is_string($data) || $data===''){
		return '';
	}

	$patterns = array(
		// charset=... anywhere inside a <meta> tag (covers both meta forms)
		'/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([a-z0-9_-]+)/i',
		// a bare "text/html; charset=..." outside of a <meta> tag
		'/text\/html;\s*charset=([a-z0-9_-]+)/i',
	);

	foreach($patterns as $pattern){
		if(preg_match($pattern, $data, $match)){
			return strtoupper($match[1]);
		}
	}

	// No declaration found: report "unknown" without touching an unset index.
	return '';
}

 

/**
 * Download a URL with cURL and return its body converted to UTF-8.
 *
 * The request uses a 15 second timeout and sends the global $user_agent
 * string as the User-Agent header.
 *
 * @param string $url Address to fetch.
 * @return string The page body as UTF-8 (via conv2utf8()), or '' when the
 *                handle cannot be created or the transfer fails.
 */
function get_url_content($url){
	global $user_agent;

	$ch=curl_init();
	if($ch===false){
		return '';
	}

	curl_setopt_array($ch, array(
		CURLOPT_URL            => $url,
		CURLOPT_RETURNTRANSFER => 1,   // return the body instead of printing it
		CURLOPT_TIMEOUT        => 15,
		CURLOPT_USERAGENT      => $user_agent,
	));

	$data = curl_exec($ch);
	curl_close($ch);

	// curl_exec() returns false on timeouts, DNS errors, etc.; do not pass that
	// on to the charset conversion.
	if($data===false){
		return '';
	}

	return conv2utf8($data);
}

/**
 * Reduce an HTML document to its text by deleting markup.
 *
 * The patterns are applied one after another, in the order listed, each on the
 * result of the previous one.
 *
 * @param string $document HTML source.
 * @return string The document with scripts, styles, comments, tags and space
 *                characters removed.
 */
function html2txt($document){
	$search = array(
		// <script> blocks together with their code
		'@<script[^>]*?>.*?</script>@si',
		// <style> blocks together with their rules. No "U" modifier here: it
		// inverts "*?" back to greedy, which made one match run from the first
		// <style> to the last </style> and swallow the content in between.
		'@<style[^>]*?>.*?</style>@si',
		// HTML comments, removed before the generic tag pattern so that a ">"
		// inside a comment cannot end the match early and leak the remainder.
		'@<!--.*?-->@s',
		// any remaining tag
		'@<[\/\!]*?[^<>]*?>@si',
		// leftover "<! ... -->" constructs (multi-line comments, CDATA wrappers)
		'@<![\s\S]*?--[ \t\n\r]*>@',
		// every space character.
		// NOTE(review): this may originally have targeted "&nbsp;" or a
		// non-breaking space - confirm against the original file.
		'@ @'
	);

	return preg_replace($search, '', $document);
}

/**
 * Return the contents of the first <title> element of a page.
 *
 * @param string $data HTML source.
 * @return string The title text, or '' when the page has no <title> element.
 */
function get_title($data){
	// [^>]* after the tag name tolerates attributes on <title>; the lazy .*?
	// lets the title itself contain ">" characters.
	if(!preg_match('/<title\b[^>]*>(.*?)<\/title>/si', $data, $match)){
		return '';
	}

	// NOTE(review): search and replacement look identical here, which makes
	// this call a no-op; the search string may originally have been "&nbsp;"
	// or a non-breaking space - confirm against the original file.
	return str_replace(' ', ' ', $match[1]);
}

 
/**
 * Collect the link target of every <h3 class=r> heading in a page.
 *
 * @param string $data HTML source to scan.
 * @return string The first href of each heading, in document order, each
 *                followed by "^". A heading without an href contributes an
 *                empty entry (just "^"). '' when no heading matches.
 */
function get_googlecompetition( $data){
	// The class value may be unquoted (class=r) or quoted (class="r" / class='r').
	if(!preg_match_all('/<h3 class=(["\']?)r\1>(.*?)<\/h3>/si', $data, $match)){
		return "";
	}

	$competition="";
	foreach($match[2] as $heading){
		// Keep the slot even when the heading has no link, so positions in the
		// result still line up with the headings.
		$url = preg_match('/href="(.*?)"/si', $heading, $link) ? $link[1] : '';
		$competition.="$url^";
	}
	return $competition;
}

 
/**
 * Collect the first link of every
 * <table border="0" cellpadding="0" cellspacing="0"> block in a page.
 *
 * @param string $keyword Not used by this function; kept so existing callers
 *                        keep working.
 * @param string $data    HTML source to scan.
 * @return string The first href of each table, in document order, each
 *                followed by "^". A table without an href contributes an empty
 *                entry (just "^"). '' when no table matches.
 */
function get_baidu_competition($keyword, $data){
	if(!preg_match_all('/<table border="0" cellpadding="0" cellspacing="0">(.*?)<\/table>/si', $data, $match)){
		return "";
	}

	$competition="";
	foreach($match[1] as $table){
		// Keep the slot even when the table has no link, so positions in the
		// result still line up with the tables.
		$url = preg_match('/href="([^"]*?)"/si', $table, $link) ? $link[1] : '';
		$competition.="$url^";
	}
	return $competition;
}
 
/**
 * Collect the first link of every section that starts with
 * <span class="445043119-04022004"> and runs up to the next </p>.
 *
 * @param string $keyword Not used by this function; kept so existing callers
 *                        keep working.
 * @param string $data    HTML source to scan.
 * @return string The first href of each section, in document order, each
 *                followed by "^". A section without an href contributes an
 *                empty entry (just "^"). '' when no section matches.
 */
function get_blog_competition($keyword, $data){
	if(!preg_match_all('/<span class="445043119-04022004">(.*?)<\/p>/si', $data, $match)){
		return "";
	}

	$competition="";
	foreach($match[1] as $section){
		// Keep the slot even when the section has no link, so positions in the
		// result still line up with the sections.
		$url = preg_match('/href="([^"]*?)"/si', $section, $link) ? $link[1] : '';
		$competition.="$url^";
	}
	return $competition;
}
/**
 * Extract v.iask.com JPG addresses from the <img> tags of a page.
 *
 * Only tags that end with '> (a single-quoted last attribute) are considered.
 *
 * @param string $data HTML source to scan.
 * @return string All matching image URLs concatenated in document order with
 *                no separator between them; '' when there is none.
 */
function get_jpg(  $data){
	// [^>] keeps each match inside a single <img ...> tag and [^'">\s] keeps the
	// captured URL inside a single attribute value, so a URL from a later,
	// unrelated tag can no longer be glued onto an earlier <img>. The dots of
	// the host name are escaped so they only match literal dots.
	$pattern = '/<img[^>]*?(http[^\'">\s]*?v\.iask\.com[^\'">\s]*?jpg)[^>]*?\'>/si';

	if(!preg_match_all($pattern, $data, $match)){
		return "";
	}

	return implode('', $match[1]);
}


?>

你可能感兴趣的:(以前攒的一些函数,都是用来抓网页做分析的,有可能有失效的,凑合看吧)