爬虫2 | Prjoker

缘起一个好物推送公众号的页面（不晓得这样做会不会涉及到什么侵权之类的问题，如果有，请一定赶紧告诉我一下……）。因为之前学习了 Python，又不怎么习惯没有分类的超多好物，所以想通过爬虫简单筛选出自己需要的信息（名字、价格、链接等等）。链接是：https://mp.weixin.qq.com/s/RU-J2HlnYlAFX4cnt-ELNA ，来自一个叫“钱抠抠”的公众号。

总的目标是这样的：1、获取促销物品标题 2、获取促销价格 3、获取链接 4、整合起来然后塞到excel做成一个表。

import urllib.request
import xlwt



def url_open(url: str) -> bytes:
    """Fetch *url* and return the raw response body.

    The request carries a desktop-browser ``User-Agent`` header.

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


def _strong_text(html, start):
    """Return the text directly preceding the first ``</strong>`` at or after *start*.

    The text runs from just after the nearest preceding ``>`` up to the
    closing tag, so it is empty when the tag is immediately preceded by
    another tag. Returns ``None`` when *start* is negative or no
    ``</strong>`` follows it.
    """
    if start < 0:
        return None
    end = html.find('</strong>', start)
    if end == -1:
        return None
    begin = html.rfind('>', 0, end) + 1
    return html[begin:end]


def _extract_titles(html, max_steps=350):
    """Collect the title strings found after each colour marker in *html*.

    Scanning starts at the first ``rgb(171, 25, 66)`` marker inside the
    article's content ``<div>`` / first full-width ``<section>`` and stops
    when the markers or their ``</strong>`` tags run out, or after
    *max_steps* markers have been examined.
    """
    marker = 'rgb(171, 25, 66)'
    # Two markers closer than this are merged into one title.
    # NOTE(review): presumably a title split across two coloured spans -- confirm.
    merge_distance = 1000

    titles = []
    container = html.find('<div class="rich_media_content " id="js_content" style="visibility:')
    if container == -1:
        return titles
    section = html.find('<section data-width="100%" ', container)
    if section == -1:
        return titles

    current = html.find(marker, section)
    steps = 0
    # str.find() returns -1 on failure; using -1 as a start offset would
    # silently restart the search near the end/beginning of the page, so
    # every failed lookup ends the scan instead.
    while current != -1 and steps < max_steps:
        steps += 1
        first = _strong_text(html, current)
        if first is None:
            break
        following = html.find(marker, current + 1)
        if not first:
            # Marker with no text before its </strong>: skip it.
            current = following
            continue
        if following != -1 and following - current < merge_distance:
            second = _strong_text(html, following)
            titles.append(first + (second or ''))
            current = html.find(marker, following + 1)
        else:
            titles.append(first)
            current = following
    return titles


def find_title(url):
    """Download the article at *url* and print every title found in it.

    Args:
        url: Address of the article page; the body is decoded as UTF-8.

    Returns:
        The list of title strings, in page order (each one is also printed).
    """
    html = url_open(url).decode('utf-8')
    titles = _extract_titles(html)
    for title in titles:
        print(title)
    return titles
                         

    

def download_mm(url='https://mp.weixin.qq.com/s/wLEpnB6c3Y54WA_PIsvLlg'):
    """Run the title scraper on one article page.

    Args:
        url: Article to scrape. Defaults to the sample article; another one
            used during development was
            ``https://mp.weixin.qq.com/s/6t_GG2uS7igwWw6OHv6v0g``.

    Returns:
        Whatever ``find_title`` returns, passed through unchanged.
    """
    return find_title(url)
    
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    download_mm()

这里暂时（勉强）完成第一步需求