Using a Kindle to Follow Web Novels, Part 2

Background

My earlier post 用 kindle 追网文 described one way to follow web novels on a Kindle. It worked fine for a while, and then the situation changed again: the site now splits every chapter into two pages. As a result, my script only grabbed the first page of each chapter and never the second, so the content it pushed was incomplete. This post makes a few small changes to the code from 用 kindle 追网文 to handle the new layout.
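
The fix is easy because of how the site names the split pages: the second page's URL is the first page's URL with a _2 suffix before .html. A minimal sketch of the idea (the helper name and the chapter id in the comment are made up for illustration; the full script below simply inlines the replace() call):

def second_page_url(first_page_url):
    # e.g. 'https://m.xinxs.la/392_392855/123.html'   (hypothetical chapter id)
    #  ->  'https://m.xinxs.la/392_392855/123_2.html'
    return first_page_url.replace(".html", "_2.html")

So each new chapter now produces two Instapaper bookmarks, one per page.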

The new approach

Everything else stays the same; only the Python script needed a small tweak (I also cleaned up the formatting so it looks a bit more professional :). The full code is below:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import time
import requests
import os.path
import pickle
import sys
from bs4 import BeautifulSoup
from pyinstapaper.instapaper import Instapaper, Bookmark

# Python 2: make utf-8 the default encoding for implicit str/unicode conversions
reload(sys)
sys.setdefaultencoding('utf-8')

# Fill in these four variables with your own values
INSTAPAPER_KEY = '************************'
INSTAPAPER_SECRET = '*********************'
INSTAPAPER_LOGIN = '[email protected]'
INSTAPAPER_PASSWORD = 'password'

# A few novels, used as examples
novel_list = [
    "苏厨",
    "王老实的幸福生活",
    "大魔王又出手了"
]
# The site's id for each novel, in the same order as novel_list
novel_url = [
    '392_392855',
    '7_7669',
    '431_431648'
]
base_url = 'https://m.xinxs.la'

instapaper = Instapaper(INSTAPAPER_KEY, INSTAPAPER_SECRET)
instapaper.login(INSTAPAPER_LOGIN, INSTAPAPER_PASSWORD)


def add_bookmark_instapaper(title, url):
    data = {
        'time': time.time(),
        'progress_timestamp': 0,
        'title': title,
        'url': url
    }
    bookmark = Bookmark(instapaper, **data)
    bookmark.add()


def fetch_novel(novel_list, novel_url):
    print('job running')
    # url.pkl holds the chapter URLs seen on the previous run,
    # one list per novel, so only new chapters get pushed
    if os.path.isfile('url.pkl'):
        with open('url.pkl', 'rb') as f:
            last_url = pickle.load(f)
    else:
        last_url = [[]] * len(novel_list)

    url_archive = []
    for j in range(0, len(novel_list)):
        print(novel_list[j])
        try:
            old_url = last_url[j]
        except IndexError:
            # a novel added since the last run has no saved URLs yet
            old_url = []
        url = base_url + '/' + novel_url[j] + '/'
        head = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
        }
        page = requests.get(url, headers=head)
        soup = BeautifulSoup(page.content, 'lxml')
        # the chapter list on the novel's index page
        soup_text = soup.select(
            'body > div.container > div.row.row-section > div > div:nth-of-type(1) > ul > li > a'
        )
        latest_url = []
        latest_title = []
        for i in range(0, len(soup_text)):
            latest_url.append(base_url + soup_text[i]['href'])
            latest_title.append(novel_list[j] + '---' +
                                soup_text[i].string.encode('utf-8'))

        for k in range(0, len(latest_url)):
            if latest_url[k] in old_url:
                continue
            # latest_url[k].replace(".html", "_2.html") is the URL of the
            # chapter's second page; push it along with the first page
            print(latest_title[k], latest_url[k],
                  latest_url[k].replace(".html", "_2.html"))
            add_bookmark_instapaper(
                latest_title[k] + " part 2",
                latest_url[k].replace(".html", "_2.html")
            )
            add_bookmark_instapaper(latest_title[k], latest_url[k])

        old_url = latest_url
        url_archive.append(old_url)

    with open('url.pkl', 'wb') as f:
        pickle.dump(url_archive, f)


# Before fetching new chapters, clear out the ones pushed last time
bookmarks = instapaper.get_bookmarks('unread')
for ct, bookmark in enumerate(bookmarks):
    print(bookmark.title)
    bookmark.archive()
    bookmark.delete()

fetch_novel(novel_list, novel_url)
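
One design choice worth noting: the script empties the Instapaper unread folder before every run, so the folder only ever holds the latest batch of chapters, and it is url.pkl that prevents the same chapter from being pushed twice. If you want to peek at what the script remembers between runs, url.pkl is just a pickled list of URL lists, one per novel; a throwaway snippet like this (not part of the script) will print it:

import pickle

# url.pkl holds one list of chapter URLs per novel, in novel_list order
with open('url.pkl', 'rb') as f:
    for urls in pickle.load(f):
        print(len(urls), urls[:1])  # number of chapters seen, plus a sample URL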

And that's it.