登录
首页精彩阅读利用python实现新浪微博爬虫_python新浪微博爬虫
利用python实现新浪微博爬虫_python新浪微博爬虫
2016-12-30
收藏

利用python实现新浪微博爬虫_python新浪微博爬虫

本文后面的解决动态加载的程序依然有效

重新编辑了一次,出了点儿问题

第一个模块,模拟登陆sina微博,创建weiboLogin.py文件,输入以下代码:

[python]
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-

  3. import sys
  4. import urllib
  5. import urllib2
  6. import cookielib
  7. import base64
  8. import re
  9. import json
  10. import hashlib

  11. class weiboLogin:
  12.     cj = cookielib.LWPCookieJar()
  13.     cookie_support = urllib2.HTTPCookieProcessor(cj)
  14.     opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
  15.     urllib2.install_opener(opener)
  16.     postdata = {
  17.         ‘entry’: ‘weibo’,
  18.         ‘gateway’: ‘1’,
  19.         ‘from’: ”,
  20.         ‘savestate’: ‘7’,
  21.         ‘userticket’: ‘1’,
  22.         ‘ssosimplelogin’: ‘1’,
  23.         ‘vsnf’: ‘1’,
  24.         ‘vsnval’: ”,
  25.         ‘su’: ”,
  26.         ‘service’: ‘miniblog’,
  27.         ‘servertime’: ”,
  28.         ‘nonce’: ”,
  29.         ‘pwencode’: ‘wsse’,
  30.         ‘sp’: ”,
  31.         ‘encoding’: ‘UTF-8’,
  32.         ‘url’: ‘http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack’,
  33.         ‘returntype’: ‘META’
  34.     }

  35.     def get_servertime(self):
  36.         url = ‘http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939’
  37.         data = urllib2.urlopen(url).read()
  38.         p = re.compile(‘(.∗)‘)
  39.         try:
  40.             json_data = p.search(data).group(1)
  41.             data = json.loads(json_data)
  42.             servertime = str(data[‘servertime’])
  43.             nonce = data[‘nonce’]
  44.             return servertime, nonce
  45.         except:
  46.             print ‘Get severtime error!’
  47.             return None

  48.     def get_pwd(self, pwd, servertime, nonce):
  49.         pwd1 = hashlib.sha1(pwd).hexdigest()
  50.         pwd2 = hashlib.sha1(pwd1).hexdigest()
  51.         pwd3_ = pwd2 + servertime + nonce
  52.         pwd3 = hashlib.sha1(pwd3_).hexdigest()
  53.         return pwd3

  54.     def get_user(self, username):
  55.         username_ = urllib.quote(username)
  56.         username = base64.encodestring(username_)[:-1]
  57.         return username


  58.     def login(self,username,pwd):
  59.         url = ‘http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)’
  60.         try:
  61.             servertime, nonce = self.get_servertime()
  62.         except:
  63.             print ‘get servertime error!’
  64.             return
  65.         weiboLogin.postdata[‘servertime’] = servertime
  66.         weiboLogin.postdata[‘nonce’] = nonce
  67.         weiboLogin.postdata[‘su’] = self.get_user(username)
  68.         weiboLogin.postdata[‘sp’] = self.get_pwd(pwd, servertime, nonce)
  69.         weiboLogin.postdata = urllib.urlencode(weiboLogin.postdata)
  70.         headers = {‘User-Agent’:‘Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11’}
  71.         req  = urllib2.Request(
  72.             url = url,
  73.             data = weiboLogin.postdata,
  74.             headers = headers
  75.         )
  76.         result = urllib2.urlopen(req)
  77.         text = result.read()
  78.         p = re.compile(‘location\.replace\’(.∗?)\’‘)
  79.         try:
  80.             login_url = p.search(text).group(1)
  81.             urllib2.urlopen(login_url)
  82.             print “Login success!”
  83.         except:
  84.             print ‘Login error!’

然后创建main.py文件,输入以下代码:

[python]
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-

  3. import weiboLogin
  4. import urllib
  5. import urllib2

  6. username = ‘你的微博用户名’
  7. pwd = ‘你的微博密码’

  8. WBLogin = weiboLogin.weiboLogin()
  9. WBLogin.login(username, pwd)

注意:若登陆失败,可能是你的账号在登陆的时候需要输入验证码!你在网页上登陆你的账号试试看,在账号设置里面可以设置某些地区不输入验证码。


接下来,考虑实现抓取微博的内容。

此时遇到一个困难,当抓取指定URL的微博时,初始显示只有15条。后面的是延迟显示的(ajax里面叫lazy load?)。也就是说,当滚动条第一次拖到最下面的时候,会显示第二部分,再拖到最下面,会显示第三部分。此时一个页面的微博才是完整的。所以,要获取一个微博页面的全部微博,需要访问这个页面三次。创建getWeiboPage.py文件,相应代码如下:

[python]
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-

  3. import urllib
  4. import urllib2
  5. import sys
  6. import time

  7. reload(sys)
  8. sys.setdefaultencoding(‘utf-8’)

  9. class getWeiboPage:
  10.     body = {
  11.         ‘__rnd’:”,
  12.         ‘_k’:”,
  13.         ‘_t’:‘0’,
  14.         ‘count’:’50’,
  15.         ‘end_id’:”,
  16.         ‘max_id’:”,
  17.         ‘page’:1,
  18.         ‘pagebar’:”,
  19.         ‘pre_page’:‘0’,
  20.         ‘uid’:”
  21.     }
  22.     uid_list = []
  23.     charset = ‘utf8’

  24.     def get_msg(self,uid):
  25.         getWeiboPage.body[‘uid’] = uid
  26.         url = self.get_url(uid)
  27.         self.get_firstpage(url)
  28.         self.get_secondpage(url)
  29.         self.get_thirdpage(url)
  30.     def get_firstpage(self,url):
  31.         getWeiboPage.body[‘pre_page’] = getWeiboPage.body[‘page’]-1
  32.         url = url +urllib.urlencode(getWeiboPage.body)
  33.         req = urllib2.Request(url)
  34.         result = urllib2.urlopen(req)
  35.         text = result.read()
  36.         self.writefile(‘./output/text1’,text)
  37.         self.writefile(‘./output/result1’,eval(“u”'”+text+“”'”))

  38.     def get_secondpage(self,url):
  39.         getWeiboPage.body[‘count’] = ’15’
  40.     #   getWeiboPage.body[‘end_id’] = ‘3490160379905732’
  41.     #   getWeiboPage.body[‘max_id’] = ‘3487344294660278’
  42.         getWeiboPage.body[‘pagebar’] = ‘0’
  43.         getWeiboPage.body[‘pre_page’] = getWeiboPage.body[‘page’]

  44.         url = url +urllib.urlencode(getWeiboPage.body)
  45.         req = urllib2.Request(url)
  46.         result = urllib2.urlopen(req)
  47.         text = result.read()
  48.         self.writefile(‘./output/text2’,text)
  49.         self.writefile(‘./output/result2’,eval(“u”'”+text+“”'”))
  50.     def get_thirdpage(self,url):
  51.         getWeiboPage.body[‘count’] = ’15’
  52.         getWeiboPage.body[‘pagebar’] = ‘1’
  53.         getWeiboPage.body[‘pre_page’] = getWeiboPage.body[‘page’]

  54.         url = url +urllib.urlencode(getWeiboPage.body)
  55.         req = urllib2.Request(url)
  56.         result = urllib2.urlopen(req)
  57.         text = result.read()
  58.         self.writefile(‘./output/text3’,text)
  59.         self.writefile(‘./output/result3’,eval(“u”'”+text+“”'”))
  60.     def get_url(self,uid):
  61.         url = ‘http://weibo.com/’ + uid + ‘?from=otherprofile&wvr=3.6&loc=tagweibo’
  62.         return url
  63.     def get_uid(self,filename):
  64.         fread = file(filename)
  65.         for line in fread:
  66.             getWeiboPage.uid_list.append(line)
  67.             print line
  68.             time.sleep(1)
  69.     def writefile(self,filename,content):
  70.         fw = file(filename,‘w’)
  71.         fw.write(content)
  72.         fw.close()

在刚刚的main.py中加入相应内容,完整内容为:

  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-

  3. import weiboLogin
  4. import getWeiboMsg
  5. import urllib
  6. import urllib2

  7. username = ‘你的微博用户名’
  8. pwd = ‘你的微博密码’

  9. WBLogin = weiboLogin.weiboLogin()
  10. WBLogin.login(username, pwd)

  11. WBmsg = getWeiboMsg.getWeiboMsg()
  12. url = ‘http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo’

  13. WBmsg.get_firstpage(url)
  14. WBmsg.get_secondpage(url)
  15. WBmsg.get_thirdpage(url)

数据分析咨询请扫描二维码

客服在线
立即咨询