<?php
// Minimal cURL example: fetch one page into $output.
// Each statement sits on its own line: a `//` comment runs to the end of the
// physical line, so keeping everything on one line would comment the code out.

// 1. Initialise a cURL handle.
$ch = curl_init();

// 2. Set the options, including the URL.
curl_setopt($ch, CURLOPT_URL, "http://blog.snsgou.com");
// Return the response from curl_exec() instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
// Do not include the response headers in the returned data.
curl_setopt($ch, CURLOPT_HEADER, 0);

// 3. Execute the request and fetch the HTML document.
$output = curl_exec($ch);

// curl_exec() returns false on failure. Report the reason while the handle
// is still open; $output is left as false so later code can test for it.
if ($output === false) {
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
}

// 4. Release the cURL handle.
curl_close($ch);
?>
curl_setopt 中 CURLOPT_URL、CURLOPT_RETURNTRANSFER 等参数的含义,请参考 PHP 官方手册,里面有详细说明。
现在已经得到 $output 的内容,接下来可以使用正则表达式从中匹配出你需要的部分。
<?php
/**
 * Page-scraping helper.
 *
 * Downloads a page, cuts out the region between two marker strings and
 * extracts the links found in that region.
 */
class Gather
{
    /** @var string Not read or written by any method of this class. */
    public $pagestring = '';

    /** @var mixed Value of the global $db at construction time. */
    private $db;

    /**
     * Stores the global $db on the instance.
     */
    public function __construct()
    {
        global $db;
        $this->db = $db;
    }

    /**
     * Downloads the document at $url.
     *
     * Uses cURL (following redirects, headers excluded) when the extension is
     * loaded, otherwise file_get_contents().
     *
     * @param string $url Address to fetch; surrounding whitespace is ignored.
     * @return string Trimmed response body, or '' when the URL is empty or
     *                the request fails.
     */
    public function getUrlFile($url)
    {
        $url = trim((string) $url);
        if ($url === '') {
            // Nothing to fetch; do not hand an empty URL to the transport.
            return '';
        }

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both curl_exec() and file_get_contents() return false on failure.
        if ($content === false) {
            return '';
        }

        return trim($content);
    }

    /**
     * Extracts the links (<a ... href=...>text</a>) from an HTML fragment.
     *
     * The link-text group cannot contain '>'. The href value may be quoted
     * with single or double quotes, or unquoted.
     *
     * @param string $code HTML fragment to scan.
     * @return array array('name' => link texts, 'url' => href values); the
     *               two lists are index-aligned and empty when nothing matches.
     */
    public function get_all_url($code)
    {
        // ["\'] instead of ["|\']: inside a character class '|' is a literal
        // pipe, not alternation.
        $pattern = '/<a.+?href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/is';

        // preg_match_all() returns 0 for no match and false on a PCRE error.
        if (!preg_match_all($pattern, $code, $arr)) {
            return array('name' => array(), 'url' => array());
        }

        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Returns the part of $str that follows the first occurrence of $start,
     * up to the next occurrence of $start or $end (or the end of the string).
     *
     * @param string $str   Text to search.
     * @param string $start Opening marker; surrounding whitespace is ignored.
     * @param string $end   Closing marker; surrounding whitespace is ignored.
     * @return string $str unchanged when either marker is empty, '' when
     *                $start does not occur, otherwise the extracted segment.
     */
    public function get_sub_content($str, $start, $end)
    {
        $start = trim($start);
        $end = trim($end);
        if ($start == '' || $end == '') {
            return $str;
        }

        $pieces = explode($start, $str);
        if (!isset($pieces[1])) {
            // $start not found: there is no segment to return.
            return '';
        }

        $pieces = explode($end, $pieces[1]);
        return $pieces[0];
    }

    /**
     * Debug helper: prints var_dump($var) inside a styled <div><pre> box.
     *
     * NOTE(review): the dump is not HTML-escaped, so markup contained in
     * $var is sent to the browser as-is - confirm this is intended.
     *
     * @param mixed $var Value to dump.
     * @return void
     */
    public function vd($var)
    {
        echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
        echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }
}
?>
<?php
// Demo: scrape a news list page, then fetch and clean the first article.
// Each statement sits on its own line: a `//` comment runs to the end of the
// physical line, so keeping everything on one line would comment the code out.

// Directory of this script, with backslashes normalised to forward slashes.
define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
// Enable this when the Gather class lives in its own file:
//include ROOT_PATH."/Gather.class.php";

// Remove the execution time limit for the remote fetches.
set_time_limit(0);
// NOTE(review): header() only takes effect if no output has been sent before
// this point - confirm when this snippet is embedded in a larger page.
header("Content-type: text/html; charset=gb2312");

// Target list page.
$url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';

// Create the scraper.
$gather = new Gather();

// Fetch the HTML of the list page.
$html = $gather->getUrlFile($url);

// Markers delimiting the article list.
$start = '<div class="bd clearfix">';
$end = '<div class="pages-1 mt25">';

// Extract the article URLs and titles found between the markers.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);

// Print the result.
$gather->vd($newsAry);

// Continue only when at least one article link was extracted; otherwise
// $newsAry['url'][0] does not exist.
if (!empty($newsAry['url'])) {
    $tarGetUrl = $newsAry['url'][0];

    // Fetch the HTML of the first article.
    $html = $gather->getUrlFile($tarGetUrl);

    // Markers delimiting the article body.
    $start = '<div id="endText">';
    $end = '<span class="cDGray right" style="white-space:nowrap;">';

    // Extract the article body found between the markers.
    $code = $gather->get_sub_content($html, $start, $end);

    // Fixed snippets (an ad iframe and a site icon link) to strip from the body.
    $killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
    $killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
    $code = str_replace($killHtml, "", $code);
    $code = str_replace($killHtml2, "", $code);

    $gather->vd($code);
}
?>