Scrapy模拟登录赶集网的实现代码

  import scrapy

  import re

  class GanjiSpider(scrapy.Spider):
      """Spider that logs in to ganji.com through its passport login form.

      Flow:
        1. ``parse`` downloads the login page and extracts the ``__hash__``
           token embedded in it.
        2. ``do_formdata`` downloads the captcha image, asks the operator to
           type it in, and submits the login form.
        3. ``after_login`` prints the body of the login response.

      The credentials default to placeholders. They can be overridden with
      spider arguments, which Scrapy copies onto the spider as attributes::

          scrapy crawl ganji -a username=me -a password=secret
      """

      name = 'ganji'
      allowed_domains = ['ganji.com']

      # The login page doubles as the form submission endpoint.
      login_url = 'https://passport.ganji.com/login.php'
      captcha_url = (
          'https://passport.ganji.com/ajax.php'
          '?dir=captcha&module=login_captcha'
      )
      start_urls = [login_url]

      # File the downloaded captcha image is written to for manual inspection.
      captcha_path = 'yzm.jpg'

      def parse(self, response):
          """Extract the ``__hash__`` token and request the captcha image.

          The token is forwarded to ``do_formdata`` through ``Request.meta``.
          If the token cannot be found, an error is logged and no further
          request is scheduled.
          """
          # Non-greedy character class: stop at the first closing quote so the
          # capture cannot run past the hash value into later content.
          match = re.search(r'"__hash__":"([^"]+)"', response.text)
          if match is None:
              self.logger.error(
                  'Could not find __hash__ in login page %s', response.url
              )
              return

          yield scrapy.Request(
              self.captcha_url,
              callback=self.do_formdata,
              meta={'hash_code': match.group(1)},
          )

      def do_formdata(self, response):
          """Save the captcha, read its text from stdin, and submit the form.

          Three common ways to solve the captcha are: save it and type it in by
          hand, use an online captcha-solving service, or run OCR (for example
          tesseract). This spider uses manual input.
          """
          with open(self.captcha_path, 'wb') as f:
              f.write(response.body)

          # NOTE: input() blocks the whole crawler until the operator answers.
          code = input('请输入验证码:')

          formdata = {
              'username': getattr(self, 'username', 'your_username'),
              'password': getattr(self, 'password', 'your_password'),
              'setcookie': '14',
              'checkCode': code,
              'next': '',
              'source': 'passport',
              # response.meta is a shortcut to response.request.meta.
              '__hash__': response.meta['hash_code'],
          }

          yield scrapy.FormRequest(
              url=self.login_url,
              formdata=formdata,
              callback=self.after_login,
          )

      def after_login(self, response):
          """Print the body of the login response for inspection."""
          print(response.text)