我的代码:
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
#百度贴吧爬虫类
class BDTB:
#初始化,传入基地址,是否只看楼主的参数
def __init__(self,baseUrl,seeLZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
#传入页码,获取该页帖子的代码
def getPage(self,pageNum):
try:
url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum)
request = urllib2.Request(url)
response = urllib2.urlopen(request)
# print response.read()
return response
except urllib2.URLError, e:
if hasattr(e,"reason"):
print u"连接百度贴吧失败,错误原因",e.reason
return None
def getTitle(self):
page = self.getPage(1)
#<h3 class="core_title_txt pull-left text-overflow "纯原创我心中的NBA2014-2015赛季现役50大</h3>
pattern=re.compile('<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>',re.S)
result=re.search(pattern,page)
if result:
print result.group(1) #测试输出
#如果存在,则返回标题
return result.group(1).strip()
else:
return None
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURL,1)
bdtb.getContent(1)
运行后弹出
TypeError: expected string or buffer
为什么会这样
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
#百度贴吧爬虫类
class BDTB:
#初始化,传入基地址,是否只看楼主的参数
def __init__(self,baseUrl,seeLZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
#传入页码,获取该页帖子的代码
def getPage(self,pageNum):
try:
url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum)
request = urllib2.Request(url)
response = urllib2.urlopen(request)
# print response.read()
return response
except urllib2.URLError, e:
if hasattr(e,"reason"):
print u"连接百度贴吧失败,错误原因",e.reason
return None
def getTitle(self):
page = self.getPage(1)
#<h3 class="core_title_txt pull-left text-overflow "纯原创我心中的NBA2014-2015赛季现役50大</h3>
pattern=re.compile('<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>',re.S)
result=re.search(pattern,page)
if result:
print result.group(1) #测试输出
#如果存在,则返回标题
return result.group(1).strip()
else:
return None
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURL,1)
bdtb.getContent(1)
运行后弹出
TypeError: expected string or buffer
为什么会这样
