# -*- coding: utf-8 -*-
import time import requests import re import os.path import pickle import logging import sys from datetime import datetime from bs4 import BeautifulSoup from pyinstapaper.instapaper import Instapaper, Folder, Bookmark
# Python 2 only: make UTF-8 the interpreter-wide default encoding so implicit
# str/unicode conversions of the Chinese titles used in this script do not
# raise UnicodeDecodeError.  site.py removes sys.setdefaultencoding at
# start-up, which is why sys has to be reloaded first.  Python 3 has no
# reload() builtin and already defaults to UTF-8, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')
# Instapaper API credentials.
# Each value may be supplied through an environment variable of the same name
# so that real secrets do not have to be committed to the source file; the
# literals below are only the placeholder fallbacks.
INSTAPAPER_KEY = os.environ.get('INSTAPAPER_KEY', '************************')
INSTAPAPER_SECRET = os.environ.get('INSTAPAPER_SECRET', '*********************')
INSTAPAPER_LOGIN = os.environ.get('INSTAPAPER_LOGIN', '[email protected]')
INSTAPAPER_PASSWORD = os.environ.get('INSTAPAPER_PASSWORD', 'password')
# Novels to track, written as (display title, site path slug) pairs so that a
# title and its slug always stay on the same line.
_TRACKED_NOVELS = (
    ("苏厨", '392_392855'),
    ("王老实的幸福生活", '7_7669'),
    ("大魔王又出手了", '431_431648'),
)

# Parallel lists derived from the table above: novel_list[i] is the title
# that belongs to the slug novel_url[i].
_titles, _slugs = zip(*_TRACKED_NOVELS)
novel_list = list(_titles)
novel_url = list(_slugs)

# Root of the site; a novel's index page lives at base_url + '/' + slug + '/'.
base_url = 'https://m.xinxs.la'
def add_bookmark_instapaper(title, url):
    """Build a Bookmark for ``url`` labelled ``title`` and call its ``add()``.

    The bookmark is bound to the module-level ``instapaper`` client, stamped
    with the current time and given a reading-progress timestamp of 0.
    ``add()`` presumably submits it to the Instapaper service -- confirm
    against the pyinstapaper documentation.
    """
    bookmark = Bookmark(
        instapaper,
        time=time.time(),
        progress_timestamp=0,
        title=title,
        url=url,
    )
    bookmark.add()
def _load_url_archive(path):
    """Return the per-novel URL lists saved by the previous run ([] if none)."""
    if not os.path.isfile(path):
        return []
    # NOTE: unpickling is only acceptable because this file is produced by
    # fetch_novel() itself; never point it at data from an untrusted source.
    # Pickle data is binary, so the file must be opened in 'rb' mode.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _scrape_latest_chapters(novel_name, slug):
    """Return ``[(title, url), ...]`` for the chapter links on a novel's index page.

    ``title`` is ``novel_name + '---' + <link text>`` and ``url`` is the link's
    ``href`` prefixed with ``base_url``.
    """
    index_url = base_url + '/' + slug + '/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
    }
    # Send the mobile User-Agent with the request, and bound the wait so a
    # stalled connection cannot hang the job forever.
    page = requests.get(index_url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')
    links = soup.select(
        'body > div.container > div.row.row-section > div > div:nth-of-type(1) > ul > li > a'
    )

    chapters = []
    for link in links:
        # get_text() never returns None, unlike .string on a link that wraps
        # nested tags.
        text = link.get_text()
        if not isinstance(text, str):
            # Python 2: get_text() yields unicode; keep titles as UTF-8 bytes.
            text = text.encode('utf-8')
        chapters.append((novel_name + '---' + text, base_url + link['href']))
    return chapters


def fetch_novel(novel_list, novel_url):
    """Bookmark every chapter that has appeared since the previous run.

    For each novel the index page is scraped for its latest chapter links;
    any link whose URL is not in the list saved for that novel in ``url.pkl``
    is bookmarked twice through add_bookmark_instapaper(): once for the
    ``_2.html`` variant of the URL (titled "... part 2") and once for the URL
    itself.  The freshly scraped URL lists are then written back to
    ``url.pkl`` for the next run.

    Args:
        novel_list: display titles, one per novel.
        novel_url: site path slugs; ``novel_url[i]`` belongs to
            ``novel_list[i]``.

    Raises:
        IndexError: if ``novel_url`` is shorter than ``novel_list``.
    """
    print('job running')
    last_urls = _load_url_archive('url.pkl')

    url_archive = []
    for j, novel_name in enumerate(novel_list):
        print(novel_name)
        # A missing entry (first run, or a novel added since) means no history.
        previous = last_urls[j] if j < len(last_urls) else []
        seen = set(previous)
        chapters = _scrape_latest_chapters(novel_name, novel_url[j])

        for title, chapter_url in chapters:
            if chapter_url in seen:
                continue
            # Assumes each chapter has a second page at <name>_2.html.
            part2_url = chapter_url.replace(".html", "_2.html")
            print(title, chapter_url, part2_url)
            # The second page is bookmarked first (presumably so that the
            # first page ends up newest in the reading list -- TODO confirm).
            add_bookmark_instapaper(title + " part 2", part2_url)
            add_bookmark_instapaper(title, chapter_url)

        if chapters:
            url_archive.append([u for _, u in chapters])
        else:
            # Nothing scraped (site error or layout change): keep the old
            # history so the next good run does not re-bookmark everything.
            url_archive.append(list(previous))

    with open('url.pkl', 'wb') as f:
        pickle.dump(url_archive, f)
# Empty the reading queue first: every bookmark returned for 'unread' is
# printed, archived and then deleted.
# NOTE(review): `instapaper` is not defined in the visible part of this file;
# it is presumably an authenticated pyinstapaper client created elsewhere --
# confirm before running.
for bookmark in instapaper.get_bookmarks('unread'):
    print(bookmark.title)
    bookmark.archive()
    bookmark.delete()

# Then scan the configured novels and bookmark any chapters that are new.
fetch_novel(novel_list, novel_url)