arxiv论文整理工具

可以自动从arxiv获取各大顶会论文

  • 自动下载论文
  • 摘要提取
  • 摘要翻译
  • 代码获取
  • 整理导出pdf

代码

使用前必须修改变量 file_name（例如 file_name = 'papers.txt'），使其指向待整理的论文列表文件。

papers.txt为需要整理的论文名

  • papers.txt标准格式

    第一列为论文分类（会议/期刊），第二列为论文名，其余列可为空。示例如下：

| --------- | ------------------------------------------------------------ | -------- | ---- | -------- |
| | | | | |
| | | | | |
| ACL2020 | Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation | | | |
| ACL2020 | Simultaneous Translation Policies: From Fixed to Adaptive | | | |
| ACL2020 | Multiscale Collaborative Deep Models for Neural Machine Translation | | | |
| ACL2020 | Character-Level Translation with Self-attention | | | |
| ACL2020 | Learning to Recover from Multi-Modality Errors for Non-Autoregressive Neural Machine Translation | | | |
| ACL2020 | ENGINE: Energy-Based Inference Networks for Non-Autoregressive Machine Translation | | | |
| ACL2020 | Selecting Backtranslated Data from Multiple Sources for Improved Neural Machine Translation | | | |
| ACL2020 | Variational Neural Machine Translation with Normalizing Flows | | | |
  • 具体实现

import html
import json
import os
import random
import re
import time
import urllib.request
from urllib import parse

import chardet
import requests
from lxml import etree
from tqdm import tqdm

# Sample paper title, kept for reference only (not used by the script).
# title = 'Diversifying Dialogue Generation with Non-Conversational Text'
# Path of the Markdown-table paper list that get_paper_name() parses.
file_name = 'source_paper/COLING_摘要.txt'
# Base name of the output folder under data/ and of the exported .md files.
output_file_name = 'COLING_摘要'
# Root folder for downloaded PDFs; the script appends output_file_name to it.
download_path = 'downloads/'
# URL template used by translate(); the placeholders are filled, in order,
# with the URL-quoted text, the target language and the source language.
GOOGLE_TRANSLATE_URL = 'http://translate.google.cn/m?q=%s&tl=%s&sl=%s'


# Read the paper list file.
def get_paper_name(file_name_fun):
    """Parse a Markdown-table paper list into titles and venues.

    A usable row looks like ``| <venue> | <paper title> | ... |``: the first
    cell is the conference/journal and the second cell is the paper title.
    Separator rows (containing ``---``), rows with an empty title cell and
    lines that are not table rows at all are skipped.

    Args:
        file_name_fun: Path of the paper list file.

    Returns:
        A ``(paper_names, paper_classes)`` tuple of two equally long lists.
    """
    paper_name_fun = []
    # Conference / journal of each paper.
    paper_class_fun = []
    try:
        with open(file_name_fun, 'r', encoding='utf-8') as f:
            file_info = f.readlines()
    except UnicodeDecodeError:
        # Not UTF-8: fall back to the platform default encoding, which is
        # what the file was read with before.
        with open(file_name_fun, 'r') as f:
            file_info = f.readlines()
    for line in file_info:
        cells = line.split('|')
        # Blank lines and free text have no second column; skip them instead
        # of failing with IndexError.
        if len(cells) < 3:
            continue
        # Drop all whitespace before testing so a cell that only holds
        # spaces or the trailing newline counts as empty.
        compact_title = ''.join(cells[2].split())
        if not compact_title or '---' in compact_title:
            continue
        paper_name_fun.append(cells[2].strip())
        paper_class_fun.append(cells[1].strip())
    print('读取到 ' + str(len(paper_name_fun)) + ' 篇论文')
    return paper_name_fun, paper_class_fun


# Look a paper up on arXiv by title and collect its abstract-page links,
# download links, authors, abstract and translated abstract.
def get_paper_urls_authors_abstract(title_fun):
    """Search arXiv for a title and scrape the result page.

    Args:
        title_fun: Paper title used as the arXiv title query.

    Returns:
        A 6-tuple ``(title, paper_urls, download_urls, authors, abstract,
        abstract_translation)``. ``paper_urls`` and ``download_urls`` are the
        lists of links found on the result page (callers use element 0).
        ``authors`` is a space-joined string of the first result's authors.
        When nothing is found, or the lookup fails, every element is ``''``.
    """
    not_found = ('', '', '', '', '', '')
    result_xpath = '//*[@id="main-container"]/div[2]/ol/li'
    # quote_plus percent-encodes reserved and non-ASCII characters and turns
    # spaces into '+', so titles containing '&', '#', '?' etc. stay one query.
    url = ('https://arxiv.org/search/?query=' + parse.quote_plus(title_fun)
           + '&searchtype=title&abstracts=show&order=-announced_date_first&size=50')
    try:
        html_fun = urllib.request.urlopen(url, timeout=30).read().decode('utf-8')
        dom = etree.HTML(html_fun, etree.HTMLParser(encoding='utf-8'))
        paper_url_fun = dom.xpath(result_xpath + '/div/p/a/@href')
        # No abstract-page link means the paper is not on arXiv.
        if not paper_url_fun:
            print('\n论文不可检索!!!')
            return not_found
        download_url_fun = dom.xpath(result_xpath + '/div/p/span/a[1]/@href')
        # Restrict authors to the first hit; the links and abstract that
        # callers consume are taken from the first hit as well.
        author_fun = ' '.join(dom.xpath(result_xpath + '[1]/p[2]/a/text()')).strip()
        abstract_fun_temp = dom.xpath(result_xpath + '/p[3]/span[3]/text()')
        # A missing abstract should not discard the links already found.
        abstract_fun = str(abstract_fun_temp[0]).strip() if abstract_fun_temp else ''
        abstract_translate_fun = translate(abstract_fun, "en", "zh-CN") if abstract_fun else ''
        return title_fun, paper_url_fun, download_url_fun, author_fun, abstract_fun, abstract_translate_fun
    except Exception as exc:
        # Best effort per paper: report the failure and continue with the
        # next title. KeyboardInterrupt / SystemExit are no longer swallowed.
        print('\n获取论文信息失败: %s (%s)' % (title_fun, exc))
        return not_found


# Download the papers.
def download_paper(download_path_fun, download_url_fun, title_fun):
    """Download the first PDF link of every paper into ``download_path_fun``.

    Args:
        download_path_fun: Target directory, including the trailing separator.
        download_url_fun: One entry per paper, either ``''`` (paper not
            found) or a list of download links of which element 0 is used.
        title_fun: Paper titles, index-aligned with ``download_url_fun``;
            used as the PDF file names.
    """
    progress = tqdm(download_url_fun)
    # enumerate keeps each link paired with its own title, even when two
    # papers resolved to identical link lists.
    for index, url in enumerate(progress):
        # Skip papers that were not found ('') or have no PDF link ([]).
        if not url:
            continue
        pdf_url = url[0]
        progress.set_description("\n正在下载: %s" % pdf_url)
        try:
            r = requests.get(pdf_url, timeout=60)
            # HTTP 403 is treated as rate limiting: wait a long, randomised
            # interval and retry until the server answers differently.
            while r.status_code == 403:
                time.sleep(500 + random.uniform(0, 500))
                r = requests.get(pdf_url, timeout=60)
        except requests.RequestException as exc:
            print('\n下载失败: %s (%s)' % (pdf_url, exc))
            continue
        if r.status_code != 200:
            # Do not store an error page under a .pdf name.
            print('\n下载失败 (HTTP %s): %s' % (r.status_code, pdf_url))
            continue
        # Titles may contain path separators or characters that are invalid
        # in file names on some platforms (e.g. ':' or '?').
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', str(title_fun[index]))
        with open(download_path_fun + safe_title + '.pdf', "wb") as f:
            f.write(r.content)


# Look up the official GitHub repository of each paper.
def get_github_code(paper_url_fun):
    """Append one repository URL per paper to the module-level ``github_code``.

    Args:
        paper_url_fun: One entry per paper, either ``''`` (paper not found)
            or a list of arXiv abstract links of which element 0 is used.

    Exactly one value is appended per entry (``''`` when no official
    repository is known or the lookup fails), so ``github_code`` stays
    index-aligned with the paper list.
    """
    progress = tqdm(paper_url_fun)
    for url in progress:
        code = ''
        # '' means not found; an empty list has no link to derive an id from.
        if url:
            progress.set_description("\n正在获取代码: %s" % url[0])
            # The last path segment of the abstract link is the arXiv id.
            api_url = 'https://arxiv.paperswithcode.com/api/v0/repos-and-datasets/' + url[0].split('/')[-1]
            try:
                rq = requests.get(api_url, timeout=30)
                rq.encoding = chardet.detect(rq.content)['encoding']
                code = json.loads(rq.text)['code']['official']['url'] or ''
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # Network failure, non-JSON reply, or no official entry.
                code = ''
        github_code.append(code)


# Translate an abstract.
def translate(text, text_language="auto", to_language="auto"):
    """Translate ``text`` by scraping the page at ``GOOGLE_TRANSLATE_URL``.

    Args:
        text: Text to translate.
        text_language: Source language code (``"auto"`` by default).
        to_language: Target language code (``"auto"`` by default).

    Returns:
        The translated text with HTML entities unescaped, or ``""`` when the
        request fails or the reply contains no translation element.
    """
    quoted_text = parse.quote(text)
    url = GOOGLE_TRANSLATE_URL % (quoted_text, to_language, text_language)
    try:
        # Without a timeout an unresponsive server blocks the run forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # A failed request is reported like "no translation found".
        print('\n翻译失败: %s' % exc)
        return ""
    expr = r'(?s)class="(?:t0|result-container)">(.*?)<'
    result = re.findall(expr, response.text)
    if not result:
        return ""

    return html.unescape(result[0])


# Export the full Markdown table.
def save_md(out_file_fun, title_fun, paper_class_fun, paper_url_fun, download_url_fun, github_code_fun):
    """Write the full result table (with abstracts and authors) as Markdown.

    Besides its arguments, this function reads the module-level lists
    ``abstract``, ``abstract_translate`` and ``author``, which must be
    index-aligned with ``title_fun``.

    Args:
        out_file_fun: Path of the Markdown file to (over)write.
        title_fun: Paper titles; entries equal to ``''`` are skipped.
        paper_class_fun: Conference/journal of each paper.
        paper_url_fun: Per paper ``''`` or a list of abstract-page links.
        download_url_fun: Per paper ``''`` or a list of download links.
        github_code_fun: Repository URL (or ``''``) of each paper.
    """

    def first_link(value):
        # Lookups yield '' (not found) or a list of links; only the first
        # link belongs in the table.
        if isinstance(value, (list, tuple)):
            return str(value[0]) if value else ''
        return str(value)

    def cell(value):
        # A raw '|' or a line break inside a cell would split the table row.
        return str(value).replace('|', '\\|').replace('\r', ' ').replace('\n', ' ')

    # UTF-8 so the Chinese header and translations are writable everywhere.
    with open(out_file_fun, 'w+', encoding='utf-8') as f:
        f.writelines('| 序号 | 会议/期刊 | 论文 | 主要技术 | 代码 | 论文下载地址 | 摘要 | 摘要翻译 | 作者 |\n')
        f.writelines('| --- | --- | --- | --- | --- | --- | --- | --- | --- |\n')
        row_number = 0
        for i, paper_title in enumerate(title_fun):
            # An empty title marks a paper that could not be retrieved.
            if str(paper_title) == '':
                continue
            row_number += 1
            md_str = ('| ' + str(row_number) + ' | ' + cell(paper_class_fun[i])
                      + ' | [' + cell(paper_title) + '](' + first_link(paper_url_fun[i])
                      + ') | | ' + str(github_code_fun[i])
                      + ' | ' + first_link(download_url_fun[i])
                      + ' | ' + cell(abstract[i])
                      + ' | ' + cell(abstract_translate[i])
                      + ' | ' + cell(author[i]) + ' |\n')
            f.writelines(md_str)


def save_md_simple(out_file_fun, title_fun, paper_class_fun, paper_url_fun, download_url_fun, github_code_fun):
    """Write the short result table (no abstracts or authors) as Markdown.

    Args:
        out_file_fun: Path of the Markdown file to (over)write.
        title_fun: Paper titles; entries equal to ``''`` are skipped.
        paper_class_fun: Conference/journal of each paper.
        paper_url_fun: Per paper ``''`` or a list of abstract-page links.
        download_url_fun: Per paper ``''`` or a list of download links.
        github_code_fun: Repository URL (or ``''``) of each paper.

    The input lists are not modified.
    """

    def first_link(value):
        # Lookups yield '' (not found) or a list of links; only the first
        # link belongs in the table.
        if isinstance(value, (list, tuple)):
            return str(value[0]) if value else ''
        return str(value)

    def cell(value):
        # A raw '|' or a line break inside a cell would split the table row.
        return str(value).replace('|', '\\|').replace('\r', ' ').replace('\n', ' ')

    # UTF-8 so the Chinese header is writable on every platform.
    with open(out_file_fun, 'w+', encoding='utf-8') as f:
        f.writelines('| 序号 | 会议/期刊 | 论文 | 主要技术 | 代码 | 论文下载地址 |\n')
        f.writelines('| --- | --- | --- | --- | --- | --- |\n')
        row_number = 0
        for i, paper_title in enumerate(title_fun):
            # An empty title marks a paper that could not be retrieved.
            if str(paper_title) == '':
                continue
            row_number += 1
            md_str = ('| ' + str(row_number) + ' | ' + cell(paper_class_fun[i])
                      + ' | [' + cell(paper_title) + '](' + first_link(paper_url_fun[i])
                      + ') | | ' + str(github_code_fun[i])
                      + ' | ' + first_link(download_url_fun[i]) + ' |\n')
            f.writelines(md_str)


if __name__ == '__main__':
    # Script entry point: read the paper list, query arXiv for every title,
    # download the PDFs, look up official code and export two Markdown tables.

    # Result lists, index-aligned with the paper list. They stay module-level
    # names because save_md() and get_github_code() read them as globals.
    title = []
    paper_url = []
    download_url = []
    github_code = []
    author = []
    abstract = []
    abstract_translate = []

    # Output locations derived from the configured base name.
    out_file_dir = 'data/%s/' % output_file_name
    out_file = '%s%s.md' % (out_file_dir, output_file_name)
    out_file_simple = '%s%s_simple.md' % (out_file_dir, output_file_name)
    download_path = '%s%s/' % (download_path, output_file_name)

    # Create the output folders on first use.
    for needed_dir in (out_file_dir, download_path):
        if not os.path.exists(needed_dir):
            os.makedirs(needed_dir)

    print('\n读取论文目录文件')
    paper_names, paper_class = get_paper_name(file_name)
    name_progress = tqdm(paper_names)

    print('\n根据论文名从arxiv获取论文链接 作者信息 摘要 摘要翻译')
    collectors = (title, paper_url, download_url, author, abstract, abstract_translate)
    for title_name in name_progress:
        name_progress.set_description("\n正在获取: %s" % title_name)
        # Pause between queries to avoid hammering arXiv.
        time.sleep(1)
        paper_info = get_paper_urls_authors_abstract(title_name)
        # The six fields come back in the same order as the collector lists.
        for position, collector in enumerate(collectors):
            collector.append(paper_info[position])

    # Download the PDFs.
    print('\n下载论文')
    download_paper(download_path, download_url, title)

    print('\n获取github代码')
    get_github_code(paper_url)

    # Export the full table.
    print('\n导出md文件')
    save_md(out_file, title, paper_class, paper_url, download_url, github_code)
    print(out_file)

    # Export the short table without abstract, translation and authors.
    save_md_simple(out_file_simple, title, paper_class, paper_url, download_url, github_code)
    print(out_file_simple)

    print('\n进程结束!')