python-筛选出论文中引用的中英文文献

中文文献的文内引用格式,例如:
张三(2013)、张三和李四(2014)、张三等(2015)

英文文献的文内引用格式,例如:
Zhang(2013)、Yu and Zhang(2014)、Zhang et al.(2015)


# 读取word文件需要安装python-docx包:pip3 install python-docx

import docx
import re
import os
# Open the Word document whose in-text citations are to be collected.
file = docx.Document("论文测试.docx")

# A publication year wrapped in parentheses, ASCII "(2013)" or full-width
# "（2013）".  The brackets are matched literally through character classes:
# written as a bare "(\d\d\d\d)" they form a capture group, which makes
# re.findall return only the four digits and never match the brackets that
# actually appear in the text.
_YEAR = r'[(（]\d{4}[)）]'
# One character of an English author string: anything that is neither a CJK
# ideograph nor list/sentence punctuation (ASCII and full-width forms).
_EN_CHAR = r'[^\u4e00-\u9fa5,，、。:：;；]'

# Chinese citations such as 张三(2013) / 张三等(2015): up to four CJK
# characters directly in front of the year.  {2,4} instead of a fixed {4} so
# that a citation preceded by fewer than four CJK characters (for example at
# the start of a paragraph) is still found; when four are available the match
# is the same as before.
_CN_SINGLE = re.compile(r'[\u4e00-\u9fa5]{2,4}' + _YEAR)
# Two Chinese authors joined by 和, e.g. 张三和李四(2014).
_CN_PAIR = re.compile(r'[\u4e00-\u9fa5]{2,3}和[\u4e00-\u9fa5]{2,3}' + _YEAR)
# English citations such as Zhang(2013) / Zhang et al.(2015).
_EN_SINGLE = re.compile(_EN_CHAR + r'+' + _YEAR)
# Two English authors joined by " and ", e.g. Yu and Zhang(2014).
_EN_PAIR = re.compile(_EN_CHAR + r'+ and ' + _EN_CHAR + r'+' + _YEAR)

# Accumulate the raw matches of every paragraph; the lists may contain
# duplicates and overlapping fragments at this point.
match_cn = []
match_en = []
for para in file.paragraphs:
    # Chinese references
    match_cn += _CN_SINGLE.findall(para.text) + _CN_PAIR.findall(para.text)
    # English references
    match_en += _EN_SINGLE.findall(para.text) + _EN_PAIR.findall(para.text)
# Discard empty matches, collapse duplicates and order each list.
# Author's note: sorting the Chinese entries this way is of little practical
# use, because reference lists usually order Chinese works by the letters of
# the first character; the output can be pasted into Excel and sorted there.
match_cn = sorted({entry for entry in match_cn if entry != ''})
match_en = sorted({entry for entry in match_en if entry != ''})

# Merge the Chinese and English results, then drop every entry that is a
# proper substring of another entry, so only the longest form of each
# citation is kept.
match = match_cn + match_en
match_remove = []  # entries that are contained in some longer entry

for candidate in match:
    for other in match:
        # Boolean `and` rather than bitwise `&`; an entry is flagged once for
        # every longer entry that contains it, and reported each time.
        if candidate != other and candidate in other:
            print('需要删除字符串:',candidate)
            match_remove.append(candidate)

# Filter in a single pass.  This replaces the repeated list.remove() calls,
# the bare `except:` around them, and a loop variable named `str` that
# shadowed the builtin for the rest of the module.
_discard = set(match_remove)
match = [entry for entry in match if entry not in _discard]
# Write the remaining citations to quote.txt, one per line.
# Mode 'w' truncates any previous file, so no explicit delete is needed.  The
# file is opened once and closed by the `with` block even on error; the
# earlier per-entry open() left all but the last handle unclosed and raised
# NameError on output.close() when there was nothing to write.
# UTF-8 is requested explicitly so Chinese text is written the same way
# regardless of the platform's default encoding.
with open('quote.txt', 'w', encoding='utf-8') as output:
    output.writelines(note + '\n' for note in match)
待后续修改...

你可能感兴趣的:(python)