Python-实现网络爬虫、蜘蛛.doc
文本预览下载声明
|
python 中如何提取网页正文啊 谢谢
# Fetch a page with the Python 3 standard library.
# NOTE(review): the original one-liner lost its URL string during document
# extraction — substitute the real page address for the placeholder.
import urllib.request

url = "http://example.com/"
response = urllib.request.urlopen(url)
page = response.read()  # raw bytes of the response body
python提取网页中的文本
import os,sys,datetime???
import httplib,urllib, re???
from sgmllib import SGMLParser???
??
import types???
??
from html.parser import HTMLParser


class Html2txt(HTMLParser):
    """Collect the visible text of an HTML document into ``self.text``.

    Text inside ``<head>`` (title, styles, …) is skipped; everything else
    is concatenated in document order.  Usage::

        p = Html2txt()
        p.feed(html_string)
        p.close()
        print(p.text.strip())

    Ported from ``sgmllib.SGMLParser`` (removed in Python 3) to
    ``html.parser.HTMLParser``; the public behavior is unchanged.
    """

    def reset(self):
        # Called by __init__ too, so these attributes always exist.
        self.text = ""      # accumulated character data (original lost the "")
        self.inbody = True  # False while we are inside <head>…</head>
        HTMLParser.reset(self)

    def handle_data(self, data):
        # Append character data only when outside the <head> section.
        if self.inbody:
            self.text += data

    def handle_starttag(self, tag, attrs):
        # Entering <head>: suppress text collection.
        if tag == "head":
            self.inbody = False

    def handle_endtag(self, tag):
        # Leaving </head>: resume text collection.
        if tag == "head":
            self.inbody = True
??
??
if __name__ == "__main__":
    import urllib.request

    # NOTE(review): the original lost the URL string in extraction —
    # replace the placeholder with the page you want to strip to text.
    parser = Html2txt()
    raw = urllib.request.urlopen("http://example.com/").read()
    # urlopen returns bytes in Python 3; decode before feeding the parser.
    parser.feed(raw.decode("utf-8", "replace"))
    parser.close()
    print(parser.text.strip())
python 下载网页
# Download a page with the low-level HTTP client.
# Ported from Python 2 httplib to Python 3 http.client; the original also
# lost its string quotes and host name during document extraction.
import http.client

conn = http.client.HTTPConnection("www.example.com")  # placeholder host
conn.request("GET", "/index.html")
r1 = conn.getresponse()
print(r1.status, r1.reason)
data = r1.read()
print(data)
# Bug fix: the original wrote `conn.close` without parentheses, which
# referenced the method but never actually closed the connection.
conn.close()
用python下载网页,超级简单!
# Minimal page download (three statements were fused onto one line by the
# document extraction; the URL string was lost — placeholder inserted).
from urllib.request import urlopen

webdata = urlopen("http://example.com/").read()
print(webdata)
深入python里面有
python?下载网页内容,用python的pycurl模块实现
用python 下载网页内容还是很不错的,之前是使用urllib模块实验的,但听说有pycurl这个模块,而且比urllib好,所以尝试下,废话不说,以下是代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import StringIO
import pycurl
def writefile(fstr, xfilename):
    """Write the string *fstr* to *xfilename*, replacing any existing file.

    Bug fix: the original called ``f.close`` without parentheses, so the
    file handle was never actually closed (the mode string ``"w"`` had
    also lost its quotes in extraction).  ``with`` guarantees the close.
    """
    with open(xfilename, "w") as f:
        f.write(fstr)
import io

# In-memory buffer that collects the raw response body
# (io.BytesIO replaces Python 2's StringIO.StringIO).
buf = io.BytesIO()
c = pycurl.Curl()

# NOTE(review): the original URL became a "HYPERLINK" field code when the
# document was extracted — substitute the real address for the placeholder.
myurl = "http://example.com/"

c.setopt(pycurl.URL, myurl)

# Write callback: every received chunk is appended to the buffer.
c.setopt(pycurl.WRITEFUNCTION, buf.write)

# Follow HTTP redirects…
c.setopt(pycurl.FOLLOWLOCATION, 1)
# …but cap the redirect chain to avoid redirect loops.
c.setopt(pycurl.MAXREDIRS, 5)

# Connection and total-transfer timeouts, in seconds.
c.setopt(pycurl.CONNECTTIMEOUT, 60)
c.setopt(pycurl.TIMEOUT, 300)

# Present a browser-like User-Agent (the quotes were lost in extraction).
c.setopt(
    pycurl.USERAGENT,
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
)

# Perform the transfer; blocks until it finishes.
c.perform()

# HTTP status code of the final response, e.g. 200.
print(c.getinfo(pycurl.HTTP_CODE))

# Decode for display/saving; replace undecodable bytes rather than crash.
page = buf.getvalue().decode("utf-8", "replace")
print(page)

# Save a copy to down.txt (uses writefile() defined above).
writefile(page, "down.txt")
参考链接: /archives/category/python/
显示全部