ENGLISH 意见建议 网站地图 网站帮助
广泛智力汇聚   高效成果传播   先进机制培育
联盟首页  |  协同开发  |  开放源码库  |  安全告警  |  开源导航  |  文档中心  |  服务支持  |  共创论坛  |  关于联盟


注册会员 网站帮助
    您的位置 »
    今天是: 2010年11月22日    
项目搜索

完全匹配   
开源软件
软件分类表
新发布软件
其它网站镜像
代码片段
协同开发
文档
论坛
寻求协助
热点项目
站点状态
编译工厂

联系我们
关于联盟

代码片段库:
查看代码片段

浏览 | 提交新的代码片段 | 创建代码包

抓取网页

类型:
Class
类别:
HTML Manipulation
许可证:
GNU General Public License
语言:
PHP
 
描述:
抓取网页,并将文字和图片存入数据库中。利用 getimg.php?id= 读取数据库中的图片,
利用 getarticle.php?id= 读取文档。

该代码片段的版本系列:

片段ID 下载版本 提交时间 提交人 删除
4584 | 0.3 | 2003-07-14 13:42 | chinadba

点击"下载版本"来下载该代码片段.


最新版本的代码片段: 0.3


<?

/** Table definitions. articletype values: 1 = oracle, 2 = java, 3 = system.
CREATE TABLE article (
  id int(6) NOT NULL auto_increment,
  title varchar(80) default NULL,
  content text,
  url varchar(80) default NULL,
  joindate varchar(12) default NULL,
  articletype int(2) not null,
  PRIMARY KEY  (id)
) ;
CREATE TABLE images (
  id int(4) NOT NULL auto_increment,
  bin_data longblob,
  filetype varchar(50) default NULL,
  title varchar(50) default NULL,
  articleid int(6) NOT NULL,
  PRIMARY KEY  (id)
) ENGINE=MyISAM;
(ENGINE= replaces the old TYPE= keyword, which MySQL 5.5 and later reject.)
*/

/**
 * Stores a web page (or caller-supplied HTML) in the `article` table and the
 * images it references in the `images` table, rewriting the markup so that
 * images are served through $getimg and relative links become absolute.
 *
 * Typical use:
 *   $saver = new SaveWeb($title, $url, $typeid);
 *   $articleId = $saver->webSave();
 *
 * All database access goes through mysqli prepared statements; every method
 * that writes to the database returns false when the write could not be done.
 */
class SaveWeb
{
	var $title;                        // article title; also tags images until they are linked to an article
	var $url;                          // source URL of the page; base for resolving relative links
	var $typeid;                       // value written to article.articletype
	var $content;                      // HTML supplied through setContent()
	var $getUrl = true;                // true: webSave() downloads $url; false: it uses $content
	var $getimg = "getimg.php?id=";    // prefix that, followed by an image id, replaces each stored image's src
	// NOTE(review): connection settings (including a password) are hard-coded
	// defaults; override them per deployment and keep real credentials out of
	// source control.
	var $dbuser = "root";
	var $dbpassword = "whf76128";
	var $dbname = "tech";
	var $dbhost = "127.0.0.1";

	/**
	 * @param string $title  article title
	 * @param string $url    URL of the page to store
	 * @param int    $typeid article type id
	 */
	function __construct($title,$url,$typeid)
	{
		$this->title = $title;
		$this->url = $url;
		$this->typeid = $typeid;
	}

	/**
	 * PHP 4 style constructor name, kept so existing explicit calls keep
	 * working. PHP 8 no longer treats it as a constructor, which is why
	 * __construct() above does the real work.
	 */
	function SaveWeb($title,$url,$typeid)
	{
		$this->__construct($title,$url,$typeid);
	}

	/**
	 * Supplies the HTML directly, so webSave() will not download $url.
	 *
	 * @param string $html markup to store
	 */
	function setContent($html)
	{
		$this->content = $html;
		$this->getUrl = false;
	}

	/**
	 * Stores $content as plain paragraph text (newlines become <br>), without
	 * any link or image processing.
	 *
	 * @return int|string|false id of the new article row, false on failure
	 */
	function saveContent()
	{
		return $this->insertArticle(nl2br((string)$this->content));
	}

	/**
	 * Stores the page: obtains the HTML, stores its images, rewrites links,
	 * inserts the article and attaches the stored images to it.
	 *
	 * @return int|string|false id of the new article row; false when title or
	 *                          url is empty, the page cannot be read, or the
	 *                          insert fails
	 */
	function webSave()
	{
		if($this->title==""||$this->url=="")
			return false;
		$text = $this->getUrl ? $this->getHtml($this->url) : $this->content;
		// A failed download must not be stored as an empty article.
		if($text === false)
			return false;
		$id = $this->saveHtml($this->parserHtml($text));
		if($id != false)
			$this->updateImgPID($id,$this->title);
		// Images still unattached (articleid = 0) at this point are orphans.
		$this->delImg();
		return $id;
	}

	/**
	 * Case-insensitive search for $strchild in $strobj starting at offset $int.
	 *
	 * The result can legitimately be 0 (match at the very start), so callers
	 * must compare it with === false / !== false.
	 *
	 * @param string $strobj   text to search in
	 * @param string $strchild text to search for
	 * @param int    $int      offset to start from (negative values are treated as 0)
	 * @return int|false position of the first match, false when there is none
	 */
	function strfind($strobj,$strchild,$int)
	{
		$strobj = (string)$strobj;
		$strchild = (string)$strchild;
		$int = (int)$int;
		if($int < 0)
			$int = 0;
		// stripos() rejects an empty needle or an offset past the end on some
		// PHP versions; both simply mean "not found" here.
		if($strchild === "" || $int > strlen($strobj))
			return false;
		return stripos($strobj,$strchild,$int);
	}

	/**
	 * Reads the whole resource at $url (page or image) into a string.
	 *
	 * @param string $url location to read
	 * @return string|false the bytes read; false (after printing an error
	 *                      message) when the resource cannot be opened
	 */
	function getHtml($url)
	{
		// "rb": the same method downloads images, which must not be
		// newline-translated on platforms that distinguish text mode.
		$fp = ($url == "") ? false : fopen($url,"rb");
		if($fp === false)
		{
			// The URL may come from scraped markup, so escape it before output.
			echo "<font color=red>读取失败,文件位置:".htmlspecialchars((string)$url,ENT_QUOTES)."</font><br>";
			return false;
		}

		$data = "";
		while(!feof($fp))
		{
			$chunk = fread($fp,8192);
			if($chunk === false)
				break; // read error: stop instead of looping forever
			$data .= $chunk;
		}
		fclose($fp);
		return $data;
	}

	/**
	 * Deletes every image row that is not attached to an article
	 * (articleid = 0). Best-effort: failures are ignored.
	 */
	function delImg()
	{
		$db = $this->openDb();
		if($db === false)
			return;
		try
		{
			mysqli_query($db,"delete from images where articleid = 0");
		}
		catch(Exception $e)
		{
			// Cleanup only; a failure here must not abort the caller.
		}
		mysqli_close($db);
	}

	/**
	 * Attaches the not-yet-attached images stored under $title to article $id.
	 *
	 * Only rows with articleid = 0 are touched, so images of an older article
	 * that happens to share the title are left alone.
	 *
	 * @param int    $id    article id
	 * @param string $title title the images were stored under
	 */
	function updateImgPID($id,$title)
	{
		$db = $this->openDb();
		if($db === false)
			return;
		try
		{
			$stmt = mysqli_prepare($db,"update images set articleid = ? where title = ? and articleid = 0");
			if($stmt)
			{
				$articleId = (int)$id;
				$imgTitle = (string)$title;
				mysqli_stmt_bind_param($stmt,"is",$articleId,$imgTitle);
				mysqli_stmt_execute($stmt);
				mysqli_stmt_close($stmt);
			}
		}
		catch(Exception $e)
		{
			// Best-effort, as before: unattached images are removed by delImg().
		}
		mysqli_close($db);
	}

	/**
	 * Inserts $data as a new article using the object's title, url and typeid.
	 *
	 * @param string $data article body (HTML)
	 * @return int|string|false id of the new row, false on failure
	 */
	function saveHtml($data)
	{
		return $this->insertArticle((string)$data);
	}

	/**
	 * Downloads the image at $url and stores it, unattached (articleid = 0),
	 * under the object's title.
	 *
	 * @param string $url absolute image URL
	 * @return int|string|false id of the new image row; false when the
	 *                          download or the insert fails
	 */
	function saveImg($url)
	{
		$data = $this->getHtml($url);
		// Do not create an empty image row for a failed download.
		if($data === false)
			return false;

		$db = $this->openDb();
		if($db === false)
			return false;
		$id = false;
		try
		{
			$stmt = mysqli_prepare($db,"INSERT INTO images (bin_data,filetype,title,articleid)  VALUES (?,?,?,0)");
			if($stmt)
			{
				$fileType = $this->getContentType($url);
				$imgTitle = (string)$this->title;
				mysqli_stmt_bind_param($stmt,"sss",$data,$fileType,$imgTitle);
				if(mysqli_stmt_execute($stmt))
					$id = mysqli_insert_id($db);
				mysqli_stmt_close($stmt);
			}
		}
		catch(Exception $e)
		{
			$id = false;
		}
		mysqli_close($db);
		return $id;
	}

	/**
	 * @param string $inFileName path or URL
	 * @return string the last path component
	 */
	function getContentName($inFileName)
	{
		return basename($inFileName);
	}

	/**
	 * Guesses a MIME type from the extension of a file name or URL.
	 *
	 * The extension is matched case-insensitively and any ?query or #fragment
	 * is ignored, so "PIC.JPG?v=2" is recognised as image/jpeg.
	 *
	 * @param string $inFileName path or URL
	 * @return string MIME type; "application/octet-stream" when unknown
	 */
	function getContentType($inFileName)
	{
		$known = array(
			".gif"  => "image/gif",
			".gz"   => "application/x-gzip",
			".htm"  => "text/html",
			".html" => "text/html",
			".jpg"  => "image/jpeg",
			".jpeg" => "image/jpeg",
			".tar"  => "application/x-tar",
			".txt"  => "text/plain",
			".zip"  => "application/zip",
			".png"  => "image/png",
			".bmp"  => "image/bmp",
		);
		// Strip query/fragment first, then the directory part.
		$name = basename(preg_replace('/[?#].*$/s','',(string)$inFileName));
		$extension = strrchr($name,".");
		if($extension === false)
			return "application/octet-stream";
		$extension = strtolower($extension);
		return isset($known[$extension]) ? $known[$extension] : "application/octet-stream";
	}

	/**
	 * Rewrites the markup for local storage.
	 *
	 * Each <img src> is downloaded and stored through saveImg(), and its src
	 * is replaced by $getimg followed by the new image id (images that already
	 * point at $getimg, or that cannot be stored, are left unchanged). Each
	 * relative <a href> is made absolute against $url.
	 *
	 * @param string $text HTML to process
	 * @return string the rewritten HTML
	 */
	function parserHtml($text)
	{
		$text = (string)$text;
		$base = parse_url((string)$this->url);
		if(!is_array($base))
			$base = array();

		// Keep the page's own scheme and port instead of assuming plain http.
		$scheme = isset($base["scheme"]) ? strtolower($base["scheme"]) : "http";
		$urlHost = $scheme."://".(isset($base["host"]) ? $base["host"] : "");
		if(isset($base["port"]))
			$urlHost .= ":".$base["port"];

		// A path ending in "/" is already a directory; otherwise drop the file name.
		$path = isset($base["path"]) ? $base["path"] : "/";
		$dir = (substr($path,-1) == "/") ? $path : dirname($path);
		$dir = str_replace("\\","/",$dir); // dirname() may return "\" on Windows
		if($dir == ".")
			$dir = "";
		$urlDir = $urlHost.rtrim($dir,"/");

		$text = $this->rewriteTags($text,"img","src",true,$scheme,$urlHost,$urlDir);
		$text = $this->rewriteTags($text,"a","href",false,$scheme,$urlHost,$urlDir);
		return $text;
	}

	/**
	 * Walks every <$tag ...> element in $text and rewrites its $attr value in
	 * place. Only the attribute value inside the tag being processed is
	 * replaced, so other text that happens to contain the same characters is
	 * never touched.
	 */
	private function rewriteTags($text,$tag,$attr,$isImage,$scheme,$urlHost,$urlDir)
	{
		$getimgParts = parse_url((string)$this->getimg);
		$localPath = (is_array($getimgParts) && isset($getimgParts["path"])) ? $getimgParts["path"] : "";
		$stored = array(); // original src => replacement, so a repeated image is fetched once
		$open = "<".$tag;
		$pos = 0;

		while(($pos = $this->strfind($text,$open,$pos)) !== false)
		{
			// The tag name must be followed by whitespace; otherwise this is a
			// different element (<abbr>, <area>, ...) or a tag without attributes.
			$after = substr($text,$pos+strlen($open),1);
			if($after === false || $after === "" || strpos(" \t\r\n\f",$after) === false)
			{
				$pos++;
				continue;
			}
			$close = $this->strfind($text,">",$pos);
			if($close === false)
				break; // unterminated tag: nothing further can be parsed reliably

			$next = $close+1;
			$found = $this->findAttribute(substr($text,$pos,$close-$pos+1),$attr);
			if($found !== false)
			{
				list($value,$offset) = $found;
				$resolved = $this->resolveUrl($value,$scheme,$urlHost,$urlDir);
				$new = $value;
				if(!$isImage)
					$new = $resolved;
				else if(isset($stored[$value]))
					$new = $stored[$value];
				else if($value !== "" && ($localPath === "" || $this->strfind($resolved,$localPath,0) === false))
				{
					// Not yet served by $getimg: store it and point the tag at the copy.
					$id = $this->saveImg($resolved);
					if($id != false)
						$new = $this->getimg.$id;
					$stored[$value] = $new;
				}

				if($new !== $value)
				{
					$text = substr_replace($text,$new,$pos+$offset,strlen($value));
					$next += strlen($new)-strlen($value);
				}
			}
			$pos = $next;
		}
		return $text;
	}

	/**
	 * Finds attribute $attr in the text of one tag.
	 *
	 * @return array|false array(value, offset of the value inside $tagText),
	 *                     or false when the tag has no such attribute
	 */
	private function findAttribute($tagText,$attr)
	{
		// Double-quoted, single-quoted or unquoted value.
		$pattern = '/\s'.preg_quote($attr,'/').'\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s>"\']+))/i';
		if(!preg_match($pattern,$tagText,$m,PREG_OFFSET_CAPTURE))
			return false;
		for($g = 1; $g <= 3; $g++)
		{
			// Groups that did not take part in the match have offset -1.
			if(isset($m[$g]) && $m[$g][1] >= 0)
				return array($m[$g][0],$m[$g][1]);
		}
		return false;
	}

	/**
	 * Makes $url absolute against the page's host and directory.
	 * Empty values, #fragments and URLs that already carry a scheme
	 * (http:, https:, mailto:, ...) are returned unchanged.
	 */
	private function resolveUrl($url,$scheme,$urlHost,$urlDir)
	{
		if($url === "" || $url[0] == "#")
			return $url;
		if(preg_match('/^[a-z][a-z0-9+.\-]*:/i',$url))
			return $url;
		if(substr($url,0,2) == "//")
			return $scheme.":".$url; // protocol-relative
		if($url[0] == "/")
			return $urlHost.$url;
		return $urlDir."/".$url;
	}

	/**
	 * Inserts one article row with today's (UTC) date.
	 *
	 * @param string $content article body
	 * @return int|string|false id of the new row, false on failure
	 */
	private function insertArticle($content)
	{
		$db = $this->openDb();
		if($db === false)
			return false;
		$id = false;
		try
		{
			$stmt = mysqli_prepare($db,"INSERT INTO article (title,content,url,joindate,articletype)  VALUES (?,?,?,?,?)");
			if($stmt)
			{
				// Bind local copies: bound variables are passed by reference.
				$title = (string)$this->title;
				$url = (string)$this->url;
				$date = gmdate("Y-m-d");
				$typeid = (int)$this->typeid;
				mysqli_stmt_bind_param($stmt,"ssssi",$title,$content,$url,$date,$typeid);
				if(mysqli_stmt_execute($stmt))
					$id = mysqli_insert_id($db);
				mysqli_stmt_close($stmt);
			}
		}
		catch(Exception $e)
		{
			$id = false;
		}
		mysqli_close($db);
		return $id;
	}

	/**
	 * Opens a connection using the object's db* settings.
	 *
	 * @return mysqli|false the connection, false when it cannot be opened
	 */
	private function openDb()
	{
		if(!function_exists("mysqli_connect"))
			return false;
		try
		{
			$db = mysqli_connect($this->dbhost,$this->dbuser,$this->dbpassword,$this->dbname);
		}
		catch(Exception $e)
		{
			// mysqli throws instead of returning false from PHP 8.1 on.
			return false;
		}
		return $db ? $db : false;
	}
}


?>
		

提交新版本

如果您修改了一个代码片段并且觉得很应该让别人共享,您可以把这作为这个代码片段的最新版本提交上来.


联盟团体会员
合作伙伴
© 共创软件联盟 版权所有
联盟服务条款 | 联盟隐私权规则 | 联系我们
电话: (8610)68313388-5949 | 传真: (8610)88377936
京ICP备05056057号