1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
| import urllib.request import xlwt
def url_open(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop Chrome ``User-Agent`` header in place
    of urllib's default one.

    Args:
        url: Address to fetch; any scheme ``urllib.request`` can open.

    Returns:
        bytes: The undecoded response body.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError``) propagates to the caller unchanged.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response (and its connection) even when
    # read() fails; the original left it open.
    with urllib.request.urlopen(request) as response:
        return response.read()
def find_title(url):
    """Download the page at *url* and print every title found in it.

    The page is fetched with ``url_open``, decoded as UTF-8 and scanned by
    ``_iter_titles``; each title is printed on its own line.

    Args:
        url: Address of the article page to scan.

    Returns:
        None. Titles are written to standard output.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    html = url_open(url).decode('utf-8')
    for title in _iter_titles(html):
        print(title)


def _strong_text(html, start):
    """Return the text ending at the first ``</strong>`` at or after *start*.

    The text begins right after the closest ``>`` that precedes the closing
    tag. Returns ``None`` when there is no ``</strong>`` at or after *start*.
    """
    close_at = html.find('</strong>', start)
    if close_at == -1:
        return None
    # rfind() gives -1 when no '>' precedes the tag; adding 1 then makes the
    # slice start at the beginning of the document.
    text_from = html.rfind('>', 0, close_at) + 1
    return html[text_from:close_at]


def _iter_titles(html, max_steps=350, pair_gap=1000):
    """Yield the titles located by the colour marker inside the article body.

    Scanning starts at the first ``rgb(171, 25, 66)`` occurrence after the
    first ``<section data-width="100%" `` that follows the ``js_content``
    container. For every marker the candidate title is the text returned by
    ``_strong_text``. Two markers less than *pair_gap* characters apart are
    treated as two halves of one title and their texts are concatenated.
    Markers whose text is empty are skipped.

    Unlike the original loop, every "not found" (-1) result ends or narrows
    the scan instead of being used as an index, so the scan never wraps back
    to the start of the document and always terminates.

    Args:
        html: Decoded page source.
        max_steps: Upper bound on processed markers (the original loop cap).
        pair_gap: Maximum distance, in characters, between paired markers.
    """
    marker = 'rgb(171, 25, 66)'

    container_at = html.find('<div class="rich_media_content " id="js_content" style="visibility:')
    if container_at == -1:
        return
    section_at = html.find('<section data-width="100%" ', container_at)
    if section_at == -1:
        return

    marker_at = html.find(marker, section_at)
    for _ in range(max_steps):
        if marker_at == -1:
            return  # no markers left
        head = _strong_text(html, marker_at)
        if head is None:
            # No closing tag after this marker, hence none after later ones.
            return
        next_at = html.find(marker, marker_at + 1)
        if not head:
            # Empty text: not a title, move on to the next marker.
            marker_at = next_at
        elif next_at != -1 and next_at - marker_at < pair_gap:
            # Close neighbours form one title; consume both markers.
            tail = _strong_text(html, next_at)
            yield head + (tail or '')
            marker_at = html.find(marker, next_at + 1)
        else:
            yield head
            marker_at = next_at
def download_mm(url='https://mp.weixin.qq.com/s/wLEpnB6c3Y54WA_PIsvLlg'):
    """Run ``find_title`` on an article page.

    Args:
        url: Address passed to ``find_title``. The default is the article
            this script was originally hard-wired to. An alternative that
            was kept as a commented-out line in the original:
            https://mp.weixin.qq.com/s/6t_GG2uS7igwWw6OHv6v0g
    """
    find_title(url)


if __name__ == '__main__':
    download_mm()
|