缘起于一个好物推送公众号的页面(不确定这样做是否会涉及侵权,如果涉及,请一定尽快告诉我)。之前学习了 Python,又不太习惯没有分类的海量好物列表,所以想通过爬虫做一些简单的筛选,提取自己需要的信息(名称、价格、链接等)。页面链接是:https://mp.weixin.qq.com/s/RU-J2HlnYlAFX4cnt-ELNA ,来自一个叫“钱抠抠”的公众号。

总的目标是这样的:1、获取促销物品标题;2、获取促销价格;3、获取链接;4、把以上信息整合起来,写入 Excel 做成一张表。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import urllib.request
import xlwt



def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries a desktop-Chrome ``User-Agent`` header
    (presumably so the server treats it like a normal browser visit).

    Args:
        url: Address of the page to download.

    Returns:
        The undecoded response body (``bytes``); callers decode it.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request cannot be completed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises;
    # previously the response object was never closed.
    with urllib.request.urlopen(request) as response:
        return response.read()


def _text_before_strong_close(html, start):
    """Return the text that directly precedes the next ``</strong>``.

    Searches for the first ``</strong>`` at or after *start* and returns
    the characters between the closest preceding ``>`` and that closing
    tag.  Returns ``None`` when no ``</strong>`` follows *start*.
    """
    close_at = html.find('</strong>', start)
    if close_at == -1:
        return None
    # rfind() yields -1 when there is no '>' at all; the slice then
    # starts at 0.
    open_end = html.rfind('>', 0, close_at)
    return html[open_end + 1:close_at]


def _extract_titles(html, max_steps=350):
    """Collect the item titles found in the article markup *html*.

    Titles are located via the colour marker ``rgb(171, 25, 66)`` that
    appears after the first ``<section data-width="100%" `` inside the
    ``js_content`` div.  Two markers less than 1000 characters apart are
    treated as two halves of a single title and concatenated.

    Args:
        html: Decoded page source.
        max_steps: Upper bound on the number of markers examined
            (empty ones included).

    Returns:
        A list of title strings in document order; empty when the
        expected anchors are not present.
    """
    color = 'rgb(171, 25, 66)'

    content_at = html.find('<div class="rich_media_content " id="js_content" style="visibility:')
    if content_at == -1:
        return []
    section_at = html.find('<section data-width="100%" ', content_at)
    if section_at == -1:
        return []

    titles = []
    marker = html.find(color, section_at)
    steps = 0
    # Stop as soon as the markers run out: str.find() returns -1, and
    # feeding that back in as a start offset would restart the scan from
    # the top of the document and emit every title again.
    while marker != -1 and steps < max_steps:
        steps += 1

        first = _text_before_strong_close(html, marker)
        if first is None:
            # No closing tag after this marker, so no further titles.
            break

        next_marker = html.find(color, marker + 1)
        if not first:
            # Marker with no text before </strong>: skip it.
            marker = next_marker
            continue

        if next_marker != -1 and next_marker - marker < 1000:
            # Title split across two coloured spans.
            second = _text_before_strong_close(html, next_marker)
            if second is None:
                titles.append(first)
                break
            titles.append(first + second)
            marker = html.find(color, next_marker + 1)
        else:
            titles.append(first)
            marker = next_marker

    return titles


def find_title(url, max_steps=350):
    """Download the article at *url* and print each item title it contains.

    Args:
        url: Address of the article page.
        max_steps: Upper bound on the number of colour markers examined;
            the default matches the previously hard-coded limit of 350.

    Returns:
        The list of titles that were printed, in document order.
    """
    html = url_open(url).decode('utf-8')
    titles = _extract_titles(html, max_steps)
    for title in titles:
        print(title)
    return titles




def download_mm(url='https://mp.weixin.qq.com/s/wLEpnB6c3Y54WA_PIsvLlg'):
    """Print the item titles of one article.

    Args:
        url: Article page to process.  Defaults to the sample article
            this script was written against.

    Returns:
        Whatever ``find_title`` returns for *url*.
    """
    # Another sample article:
    # 'https://mp.weixin.qq.com/s/6t_GG2uS7igwWw6OHv6v0g'
    return find_title(url)

# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    download_mm()

到这里,暂时(勉强)完成了第一步需求:获取促销物品标题。