Scraping vendor names and URLs from the Butian public-welfare SRC with Python 3

Author: print("") Category: python Published: 2018-06-02 22:16

First, collect the submit-page address of every vendor:

Note: the Cookie header must come from a session that is already logged in.
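Pasting the cookie straight into the source works, but it is easy to leak. A minimal sketch, assuming you export it in an environment variable first (BUTIAN_COOKIE is an illustrative name, not anything the site defines):

import os

# Hypothetical: pull the logged-in cookie from the environment so the
# secret never lands in the script itself.
COOKIE = os.environ.get('BUTIAN_COOKIE', '')
if not COOKIE:
    raise SystemExit('Set BUTIAN_COOKIE to the cookie of a logged-in session first')

You would then use COOKIE as the 'Cookie' header value in the scripts below.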

import json
import time

import requests

def spider(pages):
    '''
    Walk every page of the vendor list and append each vendor's
    submit-page URL to id.txt.
    :return:
    '''
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://butian.360.cn/Reward/pub//Message/send',
        'Cookie': 'paste the cookie of a logged-in session here!!!!!',
        'Connection': 'keep-alive'
    }
    for i in range(1, pages + 1):
        data = {
            'p': i,
            'token': ''
        }
        time.sleep(3)  # pause between pages so we do not hammer the server
        res = requests.post('http://butian.360.cn/Reward/pub/Message/send',
                            data=data, headers=headers, timeout=(4, 20))
        allResult = json.loads(res.text)
        currentPage = str(allResult['data']['current'])
        vendors = allResult['data']['list']
        print('Fetching vendor data for page ' + currentPage)
        print('This page lists ' + str(len(vendors)) + ' vendors')
        base = 'http://butian.360.cn/Loo/submit?cid='
        with open('id.txt', 'a') as f:
            for vendor in vendors:
                print('Vendor name:' + vendor['company_name'] +
                      '\t\tVendor type:' + vendor['industry'] +
                      '\t\tVendor ID:' + vendor['company_id'])
                f.write(base + vendor['company_id'] + '\n')

if __name__ == '__main__':
    # One request up front, just to learn how many pages there are,
    # instead of hard-coding the page count into the loop.
    data = {
            's': '1',
            'p': '1',
            'token': ''
        }
    res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data)
    allResult = json.loads(res.text)
    allPages = int(allResult['data']['count'])
    print('Total pages: ' + str(allPages))
    spider(allPages)
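For reference, the keys the script reads imply a response shaped roughly like the dict below. This is a sketch reconstructed from the fields accessed above, with made-up values; the real payload may carry more fields:

# Assumed shape of json.loads(res.text); values are illustrative only.
allResult = {
    'data': {
        'count': 148,       # total number of pages
        'current': 1,       # the page just returned
        'list': [
            {
                'company_id': '10001',           # hypothetical ID
                'company_name': 'Example Corp',  # vendor name
                'industry': 'Internet',          # vendor type
            },
            # ... one dict per vendor on this page
        ],
    },
}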
  

The run prints one line per vendor as it goes. When it finishes, it leaves behind an id.txt file; take a look at it.
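Each line of id.txt is the submit-page URL for one vendor. With made-up cid values it looks something like:

http://butian.360.cn/Loo/submit?cid=10001
http://butian.360.cn/Loo/submit?cid=10002
http://butian.360.cn/Loo/submit?cid=10003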

With the IDs collected, we use a second script to pull each vendor's name and URL.

import time

import requests
from bs4 import BeautifulSoup

def Url():
    '''
    Iterate over every submit-page URL in id.txt, pull the matching
    domain out of each page, and save the results to target.txt.
    :return:
    '''
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://butian.360.cn/Reward/pub',
        'Cookie': 'paste the cookie of a logged-in session here!!!!',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }
    with open('id.txt', 'r') as f:
        for target in f:
            target = target.strip()
            getUrl = requests.get(target, headers=headers, timeout=(4, 20))
            info = BeautifulSoup(getUrl.text, 'html.parser')
            url = info.find(name='input', attrs={'name': 'host'})
            name = info.find(name='input', attrs={'name': 'company_name'})
            # find() returns None when the page lacks the expected inputs
            # (expired cookie, removed vendor); skip instead of crashing.
            if url is None or name is None:
                print('Skipping ' + target + ': no host/company_name input found')
                continue
            lastUrl = url.attrs['value']
            print('Vendor:' + name.attrs['value'] + '\tURL:' + lastUrl)

            url2 = 'Vendor: %s\tURL: %s' % (name.attrs['value'], lastUrl)
            with open('url2.txt', 'a') as liang:
                liang.write(url2 + '\n')
            with open('target.txt', 'a') as t:
                t.write(lastUrl + '\n')
            time.sleep(3)  # pause between requests
    print('All targets collected!')
Url()
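The script assumes the submit page embeds the vendor's domain and name as hidden <input> fields called host and company_name. A self-contained sketch of the extraction, run against a mocked-up fragment (the HTML below is an assumption about the page layout, not a capture of the real page):

from bs4 import BeautifulSoup

# Mocked-up fragment; field values are invented for illustration.
sample = '''
<form>
  <input type="hidden" name="company_name" value="Example Corp">
  <input type="hidden" name="host" value="www.example.com">
</form>
'''
info = BeautifulSoup(sample, 'html.parser')
host = info.find('input', attrs={'name': 'host'})
name = info.find('input', attrs={'name': 'company_name'})
print(name.attrs['value'], host.attrs['value'])  # Example Corp www.example.com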

Running it produces two files: url2.txt and target.txt.

url2.txt records both the vendor name and its URL.

target.txt records only the URL.
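With the made-up values from the sketches above, one record would look roughly like this:

url2.txt:
Vendor: Example Corp	URL: www.example.com

target.txt:
www.example.com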

The complete script, with both steps combined:

import json
import time

import requests
from bs4 import BeautifulSoup

def spider(pages):
    '''
    Walk every page of the vendor list and append each vendor's
    submit-page URL to id.txt.
    :return:
    '''
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://butian.360.cn/Reward/pub//Message/send',
        'Cookie': 'paste the cookie of a logged-in session here!!!!!',
        'Connection': 'keep-alive'
    }
    for i in range(1, pages + 1):
        data = {
            'p': i,
            'token': ''
        }
        time.sleep(3)  # pause between pages so we do not hammer the server
        res = requests.post('http://butian.360.cn/Reward/pub/Message/send',
                            data=data, headers=headers, timeout=(4, 20))
        allResult = json.loads(res.text)
        currentPage = str(allResult['data']['current'])
        vendors = allResult['data']['list']
        print('Fetching vendor data for page ' + currentPage)
        print('This page lists ' + str(len(vendors)) + ' vendors')
        base = 'http://butian.360.cn/Loo/submit?cid='
        with open('id.txt', 'a') as f:
            for vendor in vendors:
                print('Vendor name:' + vendor['company_name'] +
                      '\t\tVendor type:' + vendor['industry'] +
                      '\t\tVendor ID:' + vendor['company_id'])
                f.write(base + vendor['company_id'] + '\n')

def Url():
    '''
    Iterate over every submit-page URL in id.txt, pull the matching
    domain out of each page, and save the results to target.txt.
    :return:
    '''
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://butian.360.cn/Reward/pub',
        'Cookie': 'paste the cookie of a logged-in session here!!!!',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }
    with open('id.txt', 'r') as f:
        for target in f:
            target = target.strip()
            getUrl = requests.get(target, headers=headers, timeout=(4, 20))
            info = BeautifulSoup(getUrl.text, 'html.parser')
            url = info.find(name='input', attrs={'name': 'host'})
            name = info.find(name='input', attrs={'name': 'company_name'})
            # find() returns None when the page lacks the expected inputs
            # (expired cookie, removed vendor); skip instead of crashing.
            if url is None or name is None:
                print('Skipping ' + target + ': no host/company_name input found')
                continue
            lastUrl = url.attrs['value']
            print('Vendor:' + name.attrs['value'] + '\tURL:' + lastUrl)
            url2 = 'Vendor: %s\tURL: %s' % (name.attrs['value'], lastUrl)
            with open('url2.txt', 'a') as liang:
                liang.write(url2 + '\n')
            with open('target.txt', 'a') as t:
                t.write(lastUrl + '\n')
            time.sleep(3)  # pause between requests
    print('All targets collected!')

if __name__ == '__main__':
    # One request up front, just to learn how many pages there are.
    data = {
            's': '1',
            'p': '1',
            'token': ''
        }
    res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data)
    allResult = json.loads(res.text)
    allPages = int(allResult['data']['count'])
    print('Total pages: ' + str(allPages))
    spider(allPages)
    Url()

If you found this article useful, feel free to leave a tip. Your support encourages me to keep writing!


Comment from roy (guest):

I keep getting this error and don't know why:
lastUrl = url.attrs['value']
AttributeError: 'NoneType' object has no attribute 'attrs'

(This happens when find() returns None because the fetched page lacks the expected hidden input, typically when the cookie has expired.)