arxiv论文整理工具

可以根据论文名自动从 arXiv 检索并整理各大顶会论文，主要功能如下：

自动下载论文
摘要提取
摘要翻译
代码获取
整理导出pdf

代码

运行前必须修改变量 `file_name`，使其指向论文清单文件，例如 `file_name = 'papers.txt'`

papers.txt 为待整理论文的清单文件（每行一篇论文）

papers.txt标准格式

第一列为论文分类（会议/期刊），第二列为论文名，其余列可留空。

|           |                                                              |          |      |          |
| --------- | ------------------------------------------------------------ | -------- | ---- | -------- |
|           |                                                              |          |      |          |
| ACL2020   | Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation |          |      |          |
| ACL2020   | Simultaneous Translation Policies: From Fixed to Adaptive    |          |      |          |
| ACL2020   | Multiscale Collaborative Deep Models for Neural Machine Translation |          |      |          |
| ACL2020   | Character-Level Translation with Self-attention              |          |      |          |
| ACL2020   | Learning to Recover from Multi-Modality Errors for Non-Autoregressive Neural Machine Translation |          |      |          |
| ACL2020   | ENGINE: Energy-Based Inference Networks for Non-Autoregressive Machine Translation |          |      |          |
| ACL2020   | Selecting Backtranslated Data from Multiple Sources for Improved Neural Machine Translation |          |      |          |
| ACL2020   | Variational Neural Machine Translation with Normalizing Flows |          |      |          |

具体实现

import html
import json
import os
import random
import re
import time
import urllib.request
from urllib import parse

import chardet
import requests
from lxml import etree
from tqdm import tqdm

# Example of a single paper title, kept for reference:
# title = 'Diversifying Dialogue Generation with Non-Conversational Text'
# Path of the paper list file; get_paper_name() reads the category from the
# first '|'-separated column and the paper title from the second.
file_name = 'source_paper/COLING_摘要.txt'
# Base name of the output directory and of the exported .md files.
output_file_name = 'COLING_摘要'
# Root directory for downloaded PDFs; the main block appends output_file_name.
download_path = 'downloads/'
# Translation endpoint template; translate() fills it with
# (url-quoted text, target language, source language).
GOOGLE_TRANSLATE_URL = 'http://translate.google.cn/m?q=%s&tl=%s&sl=%s'


# Read the paper list file.
def get_paper_name(file_name_fun):
    """Parse the paper list table and return titles with their categories.

    Each useful line looks like ``| category | paper title | ... |``. Lines
    with fewer than two columns, rows whose title cell is blank, and rows
    whose title cell contains ``---`` (table delimiter rows) are skipped.

    Args:
        file_name_fun: Path of the paper list file.

    Returns:
        A tuple ``(paper_names, paper_classes)`` of two equally long lists:
        the stripped title cells and the stripped category cells.
    """
    try:
        with open(file_name_fun, 'r', encoding='utf-8') as f:
            file_info = f.readlines()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the platform default encoding, which
        # is what this function used before.
        with open(file_name_fun, 'r') as f:
            file_info = f.readlines()

    paper_name_fun = []
    # Venue / journal of each paper, aligned with paper_name_fun.
    paper_class_fun = []
    for line in file_info:
        cells = line.split('|')
        # Blank lines and free text have no second column; indexing them
        # used to raise IndexError.
        if len(cells) < 3:
            continue
        name = cells[2].strip()
        if not name or '---' in name.replace(' ', ''):
            continue
        paper_name_fun.append(name)
        paper_class_fun.append(cells[1].strip())
    print('读取到 ' + str(len(paper_name_fun)) + ' 篇论文')
    return paper_name_fun, paper_class_fun


# Look up a paper on arXiv by title and collect its abstract-page link,
# download link, authors, abstract and translated abstract.
def get_paper_urls_authors_abstract(title_fun):
    """Search arXiv by title and describe the first search result.

    All fields are read from the first result item only, so the links,
    authors and abstract always belong to the same paper.

    Args:
        title_fun: Paper title used as the arXiv title-search query.

    Returns:
        A 6-tuple ``(title, paper_urls, download_urls, authors, abstract,
        abstract_translation)``. ``paper_urls`` is a non-empty list of
        ``href`` strings; ``download_urls`` is a list of ``href`` strings or
        ``''`` when the result has no download link; the remaining items are
        strings. Six empty strings are returned when nothing is found or
        when any step fails.
    """
    not_found = ('', '', '', '', '', '')
    # quote_plus encodes spaces as '+' and escapes characters such as '&'
    # or '#' that would otherwise corrupt the query string.
    url = ('https://arxiv.org/search/?query=' + parse.quote_plus(title_fun)
           + '&searchtype=title&abstracts=show&order=-announced_date_first&size=50')
    try:
        html_fun = urllib.request.urlopen(url).read().decode('utf-8')
        dom = etree.HTML(html_fun, etree.HTMLParser(encoding='utf-8'))
        results = dom.xpath('//*[@id="main-container"]/div[2]/ol/li')
        paper_url_fun = results[0].xpath('div/p/a/@href') if results else []
        # No link in the first result means the paper cannot be retrieved.
        if not paper_url_fun:
            print('\n论文不可检索！！！')
            return not_found
        first_result = results[0]
        # '' (not an empty list) marks a missing download link, which is the
        # value the download and export steps already treat as "skip".
        download_url_fun = first_result.xpath('div/p/span/a[1]/@href') or ''
        author_fun = "   ".join(first_result.xpath('p[2]/a/text()')).strip()
        abstract_fun = str(first_result.xpath('p[3]/span[3]/text()')[0]).strip()
        abstract_translate_fun = translate(abstract_fun, "en", "zh-CN")
        return title_fun, paper_url_fun, download_url_fun, author_fun, abstract_fun, abstract_translate_fun
    except Exception as err:
        # Best effort: report the failure and let the caller carry on with
        # the next paper. KeyboardInterrupt is no longer swallowed.
        print('\n获取论文信息失败： %s (%s)' % (title_fun, err))
        return not_found


# Download the papers.
def download_paper(download_path_fun, download_url_fun, title_fun):
    """Download one PDF per paper into ``download_path_fun``.

    Args:
        download_path_fun: Directory prefix the file name is appended to
            (expected to end with a path separator).
        download_url_fun: One entry per paper: a list whose first element is
            the PDF URL, or an empty value when there is nothing to download.
        title_fun: Paper titles, aligned by position with
            ``download_url_fun``; used as file names.

    Entries without a URL are skipped. A 403 response is retried after a
    long randomized pause (500-1000 s) until the status changes. Any other
    non-200 response is reported and skipped instead of being saved as a
    ``.pdf`` file.
    """
    temp_download_url = tqdm(download_url_fun)
    # if not os.path.exists(download_path_fun + file_name.split('.')[0]):
    #     os.makedirs(download_path_fun + file_name.split('.')[0])
    # download_path_fun = download_path_fun + file_name.split('.')[0] + '/'
    # enumerate gives the matching title directly; list.index() was quadratic
    # and returned the first match when two entries were equal.
    for index, url in enumerate(temp_download_url):
        if not url or not url[0]:
            continue
        pdf_url = url[0]
        temp_download_url.set_description("\n正在下载： %s" % pdf_url)
        r = requests.get(pdf_url)
        while r.status_code == 403:
            time.sleep(500 + random.uniform(0, 500))
            r = requests.get(pdf_url)
        if r.status_code != 200:
            print('\n下载失败（HTTP %s）： %s' % (r.status_code, pdf_url))
            continue
        # Characters such as '/' or ':' in a title are not valid in file
        # names on every platform, so replace them.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', str(title_fun[index]))
        with open(download_path_fun + safe_title + '.pdf', "wb") as f:
            f.write(r.content)


# Fetch the official code repository of each paper.
def get_github_code(paper_url_fun):
    """Append the official code URL of every paper to ``github_code``.

    Args:
        paper_url_fun: One entry per paper: a list whose first element is
            the arXiv abstract URL, or an empty value for papers that were
            not found.

    The result is appended to the module-level list ``github_code`` (created
    by the main block). Exactly one item is appended per input entry, ``''``
    when no official repository can be determined, so the list stays aligned
    with the other per-paper lists.
    """
    temp_paper_url_fun = tqdm(paper_url_fun)
    for url in temp_paper_url_fun:
        if not url or not url[0]:
            github_code.append('')
            continue
        abs_url = url[0]
        temp_paper_url_fun.set_description("\n正在获取代码： %s" % abs_url)
        # The last path segment of the abstract URL is the arXiv identifier.
        api_url = 'https://arxiv.paperswithcode.com/api/v0/repos-and-datasets/' + abs_url.split('/')[-1]
        try:
            rq = requests.get(api_url)
            rq.encoding = chardet.detect(rq.content)['encoding']
            code = json.loads(rq.text)['code']['official']['url']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network failure, non-JSON body, or no official repository.
            code = ''
        github_code.append(code)


# Translate the abstract.
def translate(text, text_language="auto", to_language="auto"):
    """Translate ``text`` through the GOOGLE_TRANSLATE_URL endpoint.

    Args:
        text: Text to translate; it is URL-quoted before being sent.
        text_language: Source language code.
        to_language: Target language code.

    Returns:
        The HTML-unescaped content of the first element whose class is
        ``t0`` or ``result-container`` in the response, or ``""`` when the
        response contains no such element.
    """
    quoted_text = parse.quote(text)
    page = requests.get(GOOGLE_TRANSLATE_URL % (quoted_text, to_language, text_language)).text
    match = re.search(r'(?s)class="(?:t0|result-container)">(.*?)<', page)
    if match is None:
        return ""

    return html.unescape(match.group(1))


# Export the full Markdown table.
def save_md(out_file_fun, title_fun, paper_class_fun, paper_url_fun, download_url_fun, github_code_fun):
    """Write the full result table (with abstracts and authors) as Markdown.

    Args:
        out_file_fun: Path of the Markdown file to (over)write.
        title_fun: Paper titles; entries equal to ``''`` are skipped.
        paper_class_fun: Venue / journal of each paper.
        paper_url_fun: Per paper, a list whose first element is the abstract
            page URL (or an empty value).
        download_url_fun: Per paper, a list whose first element is the PDF
            URL (or an empty value). The list is not modified.
        github_code_fun: Code repository URL of each paper.

    The abstract, translated abstract and author columns are read from the
    module-level lists ``abstract``, ``abstract_translate`` and ``author``
    (created by the main block), indexed like ``title_fun``. Rows are
    numbered consecutively, ignoring skipped entries.
    """
    def _first(value):
        # Links arrive as lists; use the first element ('' when empty) so the
        # table holds a URL rather than a list repr.
        if isinstance(value, (list, tuple)):
            return value[0] if value else ''
        return value

    def _cell(value):
        # Keep free text on one table row: newlines and bare pipes would
        # split the row.
        return str(value).replace('\r', ' ').replace('\n', ' ').replace('|', '\\|')

    row_number = 0
    # The table contains Chinese text, so do not rely on the locale encoding.
    with open(out_file_fun, 'w+', encoding='utf-8') as f:
        f.write('| 序号 | 会议/期刊 | 论文 | 主要技术 | 代码 | 论文下载地址 | 摘要 | 摘要翻译 | 作者 |\n')
        f.write('| --- | --- | --- | --- | --- | --- | --- | --- | --- |\n')
        for i in range(len(title_fun)):
            if str(title_fun[i]) == '':
                continue
            row_number += 1
            f.write('| {} | {} | [{}]({}) |      | {} | {} | {} | {} | {} |\n'.format(
                row_number,
                paper_class_fun[i],
                title_fun[i],
                _first(paper_url_fun[i]),
                github_code_fun[i],
                _first(download_url_fun[i]),
                _cell(abstract[i]),
                _cell(abstract_translate[i]),
                _cell(author[i]),
            ))


def save_md_simple(out_file_fun, title_fun, paper_class_fun, paper_url_fun, download_url_fun, github_code_fun):
    """Write the short result table (no abstract or author columns).

    Args:
        out_file_fun: Path of the Markdown file to (over)write.
        title_fun: Paper titles; entries equal to ``''`` are skipped.
        paper_class_fun: Venue / journal of each paper.
        paper_url_fun: Per paper, a list whose first element is the abstract
            page URL (or an empty value).
        download_url_fun: Per paper, a list whose first element is the PDF
            URL (or an empty value). The list is not modified.
        github_code_fun: Code repository URL of each paper.

    Rows are numbered consecutively, ignoring skipped entries.
    """
    def _first(value):
        # Links arrive as lists; use the first element ('' when empty) so the
        # table holds a URL rather than a list repr.
        if isinstance(value, (list, tuple)):
            return value[0] if value else ''
        return value

    row_number = 0
    # The header contains Chinese text, so do not rely on the locale encoding.
    with open(out_file_fun, 'w+', encoding='utf-8') as f:
        f.write('| 序号 | 会议/期刊 | 论文 | 主要技术 | 代码 | 论文下载地址 |\n')
        f.write('| --- | --- | --- | --- | --- | --- |\n')
        for i in range(len(title_fun)):
            if str(title_fun[i]) == '':
                continue
            row_number += 1
            f.write('| {} | {} | [{}]({}) |      | {} | {} |\n'.format(
                row_number,
                paper_class_fun[i],
                title_fun[i],
                _first(paper_url_fun[i]),
                github_code_fun[i],
                _first(download_url_fun[i]),
            ))


if __name__ == '__main__':
    # Script entry point: read the paper list, query arXiv for every title,
    # download the PDFs, look up code repositories and export two Markdown
    # tables (a full one and a short one).
    out_file_dir = 'data/{}/'.format(output_file_name)
    out_file = '{}{}.md'.format(out_file_dir, output_file_name)
    out_file_simple = '{}{}_simple.md'.format(out_file_dir, output_file_name)
    download_path = '{}{}/'.format(download_path, output_file_name)
    # Create the output and download directories when they are missing.
    for directory in (out_file_dir, download_path):
        if not os.path.exists(directory):
            os.makedirs(directory)
    # Per-paper result lists, all aligned by position. github_code, author,
    # abstract and abstract_translate are also read or filled as module-level
    # names by get_github_code() and save_md().
    title, paper_url, download_url = [], [], []
    github_code = []
    author, abstract, abstract_translate = [], [], []
    print('\n读取论文目录文件')

    temp_file_name, paper_class = get_paper_name(file_name)
    temp_file_name = tqdm(temp_file_name)
    print('\n根据论文名从arxiv获取论文链接 作者信息 摘要 摘要翻译')
    # Same order as the tuple returned by get_paper_urls_authors_abstract().
    result_lists = (title, paper_url, download_url, author, abstract, abstract_translate)
    for title_name in temp_file_name:
        temp_file_name.set_description("\n正在获取： %s" % title_name)
        # Pause between requests.
        time.sleep(1)
        paper_urls = get_paper_urls_authors_abstract(title_name)
        for result_list, value in zip(result_lists, paper_urls):
            result_list.append(value)
    # Download the papers.
    print('\n下载论文')
    download_paper(download_path, download_url, title)
    print('\n获取github代码')
    get_github_code(paper_url)
    # Export the Markdown files.
    print('\n导出md文件')

    save_md(out_file, title, paper_class, paper_url, download_url, github_code)
    print(out_file)
    # Short table without abstract, translation and author columns.
    save_md_simple(out_file_simple, title, paper_class, paper_url, download_url, github_code)
    print(out_file_simple)

    print('\n进程结束!')