Python 3.6: National Policy Text Analysis Code

Based on the Python I have learned so far and the direction my advisor assigned, I completed five tasks:

1. Crawling and saving elderly-care policy texts from the government website http://www.gov.cn/index.htm.

2. Multi-document keyword extraction based on TF-IDF.

- Keyword extraction based on the TF-IDF algorithm (source: https://blog.csdn.net/zhangyu132/article/details/52128924)

  import jieba.analyse

  jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())
    - sentence: the text to extract keywords from
    - topK: how many of the highest TF-IDF-weighted keywords to return; default 20
    - withWeight: whether to also return each keyword's weight; default False
    - allowPOS: restrict results to the given part-of-speech tags; default empty, i.e. no filtering

- Keyword extraction based on the TextRank algorithm

    jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) can be used directly; the interface is the same as extract_tags, but note that it filters by part of speech by default.
    - Basic idea:
    1. Segment the text from which keywords are to be extracted.
    2. Build a graph from the co-occurrence relations between words within a fixed window (default size 5, adjustable via the span parameter); a usage sketch follows this list.
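A minimal usage sketch, reusing one of the policy txt files from step 1 as a placeholder input (the exact path is an assumption):

import jieba.analyse

# placeholder path: any UTF-8 policy text file works here
text = open(r'D:\python3.6.5\pycharm\main\output\txt\0.txt', 'r', encoding='utf-8').read()

# TextRank filters parts of speech by default; withWeight=True returns (word, score) pairs
for word, score in jieba.analyse.textrank(text, topK=15, withWeight=True,
                                          allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, score)

The script actually used for this step, shown next, applies extract_tags to each of the 23 policy texts.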

 

import jieba
import jieba.analyse

for i in range(23):
    text = open('D:\\python3.6.5\\pycharm\\main\\output\\txt\\' + str(i) + '.txt',
                'r', encoding='utf-8').read()
    # withWeight=True makes extract_tags return (word, weight) tuples
    words = jieba.analyse.extract_tags(text, topK=15, withWeight=True,
                                       allowPOS=('ns', 'n', 'vn', 'v', 'nr', 'x'))
    fileout = r'D:\python3.6.5\gongci\new1.txt'
    with open(fileout, 'a', encoding='utf-8') as fr:
        for word, weight in words:
            fr.write(word + '/')   # separate keywords with '/' for the co-occurrence step
            print(word, weight)

topK=15 means the 15 highest-weighted words are extracted from each article. For the co-occurrence step that follows, the words must be separated by '/', otherwise co-occurrence cannot be counted; the keyword string of each article is then placed in the same xlsx file (a minimal sketch follows the figure). The extraction result is shown below:

[Figure 1: keyword extraction results]
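The assembly into test.xlsx may well be done by hand; as a minimal sketch, under the assumption that openpyxl is available and that one article goes in each row of the first column, the same step could be scripted like this:

import jieba.analyse
from openpyxl import Workbook

wb = Workbook()
ws = wb.active

for i in range(23):
    text = open('D:\\python3.6.5\\pycharm\\main\\output\\txt\\' + str(i) + '.txt',
                'r', encoding='utf-8').read()
    words = jieba.analyse.extract_tags(text, topK=15, withWeight=False,
                                       allowPOS=('ns', 'n', 'vn', 'v', 'nr', 'x'))
    # one row per article: keywords joined by '/' so they can be split again later
    ws.append(['/'.join(words)])

wb.save(r'D:\python3.6.5\gongci\test.xlsx')

This puts one article per row in column A, which is exactly the column (colnum = 0) that readxls_bycol reads in step 3.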

3. Generating the keyword co-occurrence matrix.

import xlrd

#Read one column of the spreadsheet; returns a list of '/'-joined strings like ['a/b/c', 'd/e/f', ...]
def readxls_bycol(path,colnum):
#    path=r'D:\Ditys\python learning\学习任务\第4次任务20170222\test.xlsx'  # forgetting the r prefix sometimes causes errors!
#    colnum=2  # counting starts at 0, so the third column has index 2
    xl = xlrd.open_workbook(path)  
    table = xl.sheets()[0]
    data = list(table.col_values(colnum)) 
    print(data)
    print('----------1---------')
    return data   

#Process the data; return a set of all distinct keywords that appear
def deal_data(data):
    data_list = []
    data_set = set()
    for i in data:
         data_list.extend(i.split('/'))
#    data_list.sort()   # sort in ascending order? (left disabled)
    data_set=set(data_list)
    print(data_set)
    print('----------2---------')
    return data_set

#Build a 2-D list from the set, fill its first row and first column, and return the framed 2-D list
def creat_list_2d(data_set):
    i = len(data_set)+1
    #list1=[['' for x in range(i)] for y in range(i)]
    list_2d = [[0 for col in range(i)] for row in range(i)]  # create an i x i 2-D list filled with zeros
    n=1
    for row_1 in data_set:   
        list_2d[0][n] = row_1   # fill the first row with the keywords
        n+=1
        if n == i:
            break
    print(list_2d)
    m=1
    print(data_set)
    for cols in data_set:    # fill the first column with the keywords
        list_2d[m][0] = cols
        m += 1
        if m == i:
            break
    print(list_2d)   
    print('----------3---------')
    return list_2d


#Count co-occurrences, fill the 2-D list, and return the filled list
def count_data(list_2d,data,data_set):
    data_formted= []
    for i in data:
        data_formted.append(i.split('/'))
    print(data_formted)
    print('----------4---------')
    for row in range(1,len(data_set)+1):       # the matrix has len(data_set)+1 rows, so cover indices 1..len(data_set)
        for col in range(1,len(data_set)+1):
            if row == col:
                continue
            else:
                counter = 0
                for i in data_formted:
                    if list_2d[col][0] in i and list_2d[0][row] in i :
                        counter += 1
                list_2d[row][col] = counter
    print(list_2d)    
    print('----------5---------')
    return list_2d

#Write the matrix to a tab-delimited text file

def putdata_intotxt(path,matrix):
    with open(path,'w') as f :
        for row in range(0,len(matrix)):   
            for col in range(0,len(matrix)):  # visit every element of the 2-D list
                f.write(str(matrix[row][col]) + '\t')  # write() only accepts strings
            f.write('\n')

def main():
    #path_xls = r'test.xlsx'     # --- test data ---
    #path_txt= r'共现矩阵.txt'    # --- test data ---
    path_xls = r'D:\python3.6.5\gongci\test.xlsx'  # the r prefix is required
    path_txt= r'D:\python3.6.5\gongci\关键词共现矩阵.txt'  # the output is tab-delimited text, so give it a .txt extension
    colnum = 0
    data = readxls_bycol(path_xls,colnum)
    data_set = deal_data(data)
    list_2d = creat_list_2d(data_set)
    matrix = count_data(list_2d,data,data_set)
    print(matrix)
    putdata_intotxt(path_txt,matrix)


if __name__=='__main__':
    main()
    print('Your folder now contains the co-occurrence matrix result!')

The result is shown below:

[Figure 2: co-occurrence matrix output]

4. Generating the policy word cloud (this is a standalone program; the preceding and following steps could also be merged into it).

# -*- encoding:utf-8 -*-
import urllib.request               # urllib's request module for fetching pages
from bs4 import BeautifulSoup       # HTML parsing
import lxml                         # document parser backend
import time                         # time module
import os                           # operating-system interface
import matplotlib.pyplot as plt     # plotting library
import jieba                        # Chinese word segmentation
from PIL import Image               # image handling
from wordcloud import WordCloud, ImageColorGenerator   # word cloud library
from collections import Counter     # count repeated elements in lists, dicts, strings, etc.
import numpy as np                  # scientific computing

t = time.localtime(time.time())    # time.time() returns the current timestamp; localtime() converts it to local time

foldername = str(t.tm_year) + "-" + str(t.tm_mon) + "-" + str(t.tm_mday) + "-" + str(t.tm_hour)

picpath = 'D:\\python3.6.5\\pycharm\\main\\%s' % (foldername)


def txt(name, text):  # save text to <picpath>\<name>.txt
    if not os.path.exists(picpath):  # create the folder if it does not exist yet
        os.makedirs(picpath)
    savepath = picpath + '\\' + name + '.txt'
    with open(savepath, 'a', encoding='utf-8') as file:  # append mode; 'with' also closes the file properly
        file.write(text)
        # print(text)
    return picpath

def get_text(bs):
    # extract the plain text of every <p> tag and append it to url2.txt
    for p in bs.select('p'):
        t = p.get_text()
        # print(t)  # print the extracted text
        txt('url2', t)

def FenCi(pathin,pathout1,pathout2,picturein,pictureout):
    text = open(pathin, "r", encoding='utf-8').read()  # 1. read in the txt data

    # 2. jieba segmentation, default accurate mode. A custom dictionary userdict.txt can be added with
    #    jieba.load_userdict(file_name), where file_name is a file-like object or the path to the dictionary.
    #    The custom dictionary has the same format as the built-in dict.txt, one entry per line:
    #    word, frequency (optional) and part of speech (optional), separated by spaces, in that order.
    cut_text = jieba.cut(text, cut_all=False)
    result = " ".join(cut_text)
    #print(result)
    with open(pathout1, 'a', encoding='utf-8') as f:
        f.write(result)
    print("save")

    # 3. word count
    with open(pathout1, 'r', encoding='utf-8') as fr:  # r: read-only; w: write-only
        data = jieba.cut(fr.read())
    data = dict(Counter(data))

    with open(pathout2, 'a', encoding='utf-8') as fw:  # file that stores the word counts
        for k, v in data.items():
            fw.write('%s,%d\n' % (k, v))

    # 4. load the custom background image
    image = Image.open(picturein)
    graph = np.array(image)

    # 5. generate the word cloud; with a custom background image,
    #    the output size is determined by the image's pixel dimensions
    wc = WordCloud(font_path=r"D:\Python3.6.5\jieba\ttf\yahei.ttf", background_color='white',
                   max_font_size=50, mask=graph)
    wc.generate(result)

    # 6. recolour the words using the background image's colours as reference
    image_color = ImageColorGenerator(graph)  # derive colour values from the background image
    wc.recolor(color_func=image_color)
    wc.to_file(pictureout)

def readhtml(path):  # fetch and parse one search-results page
    res = urllib.request.urlopen(path)  # urlopen() returns the server's response object
    html = res.read().decode('utf-8')   # read() returns the raw HTML
    soupa = BeautifulSoup(html, 'lxml')
    result = soupa.find_all('div', class_='result')
    download_soup = BeautifulSoup(str(result), 'lxml')  # build a second BeautifulSoup object from the result list and keep parsing

    urls = []
    url_all = download_soup.find_all('a')
    # collect every policy link on the page
    for a_url in url_all:
        a_url = a_url.get('href')
        urls.append(a_url)
        url = a_url
        txt('url0', a_url)
        res = urllib.request.urlopen(url)  # fetch the policy page itself
        html = res.read().decode('utf-8')
        # print(html)
        txt('url1', html)
        soup = BeautifulSoup(html, 'lxml')
        get_text(soup)


for n in range(3):
    url = r'http://sousuo.gov.cn/s.htm?q=&n=10&p=' + str(n) + '&t=paper&advance=true&title=%E5%85%BB%E8%80%81&content=&puborg=&pcodeJiguan=&pcodeYear=&pcodeNum=&childtype=&subchildtype=&filetype=&timetype=timeqb&mintime=&maxtime=&sort=&sortType=1&nocorrect='  # the search-results page to crawl; must start with http
    readhtml(url)
#picpath + '\\url2.txt'
a = picpath + '\\url2.txt'          # crawled policy text
b = picpath + '\\result.txt'        # segmented text
c = picpath + '\\result.csv'        # word counts
d = r'D:\python3.6.5\jieba\1.jpg'   # background image for the word cloud
e = picpath + '\\wordcloud.png'     # output word cloud image
FenCi(a, b, c, d, e)
print('finish')
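As a side note to the custom-dictionary comment inside FenCi, loading a user dictionary could look like the following sketch; userdict.txt, its path and its entries are purely illustrative assumptions:

# userdict.txt (hypothetical), one entry per line: word  frequency(optional)  POS(optional)
#   医养结合 3 n
#   养老保险 5 n
import jieba
jieba.load_userdict(r'D:\python3.6.5\jieba\userdict.txt')  # assumed path
print('/'.join(jieba.cut('推进医养结合与基本养老保险改革', cut_all=False)))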

5. Generating the keyword network relationship graph.

import pandas as pd  
import numpy as np  
import codecs  
import networkx as nx  
import matplotlib.pyplot as plt  


a = []  
f = codecs.open(r'D:\python3.6.5\gongci\1.txt','r')  
line = f.readline()  
#print (line)  
i = 0  
A = []  
B = []  
while line!="":  
    a.append(line.split())   # fields in the saved file are space-separated
    #print (a[i][0],a[i][1])  
    A.append(a[i][0])  
    B.append(a[i][1])  
    i = i + 1
    print(i)
    line = f.readline()  
elem_dic = tuple(zip(A,B))   
#print (type(elem_dic))  
#print (list(elem_dic))
f.close()  
  
import matplotlib  
matplotlib.rcParams['font.sans-serif'] = ['SimHei']     
matplotlib.rcParams['font.family']='sans-serif'  
  
colors = ["red","green","blue","yellow"]  
G = nx.Graph()  
G.add_edges_from(list(elem_dic))  
pos=nx.random_layout(G)  #random;spring;circular
nx.draw_networkx_nodes(G, pos, alpha=0.2,node_size=1200,node_color=colors)  
nx.draw_networkx_edges(G, pos, edge_color='r', alpha=0.3)  # style='dashed'; draw_networkx_edges takes edge_color, not node_color
nx.draw_networkx_labels(G, pos, font_family='sans-serif', alpha=0.5) #font_size=5  
plt.show()

To build the relationship graph, the generated n×n co-occurrence matrix must first be converted into an n×3 edge list, as shown in the figure and the sketch below, and every value is then replaced by its reciprocal (all zeros are set to 0.01 before taking reciprocals). My reasoning is that the closer two words are, the smaller the value between them should be, whereas the co-occurrence matrix above accumulates counts additively.

[Figure 3: n×3 edge list converted from the co-occurrence matrix]
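The conversion can also be scripted. The following is a minimal sketch that reads the tab-delimited matrix from step 3 and writes the space-separated edge list that the graph script above expects in 1.txt; the exact file names are assumptions:

# minimal sketch: n x n co-occurrence matrix -> n x 3 edge list with reciprocal weights;
# no explicit encoding is given, matching how the matrix file and 1.txt are opened elsewhere
with open(r'D:\python3.6.5\gongci\关键词共现矩阵.txt', 'r') as f:
    rows = [line.rstrip('\t\n').split('\t') for line in f if line.strip()]

keywords = rows[0][1:]                    # the first row holds the keywords
with open(r'D:\python3.6.5\gongci\1.txt', 'w') as out:
    for r, row in enumerate(rows[1:]):
        for c, value in enumerate(row[1:]):
            if c <= r:                    # keep each unordered pair only once
                continue
            count = float(value)
            if count == 0:
                count = 0.01              # replace zeros before taking the reciprocal
            # closer (more co-occurring) words get a smaller value, hence the reciprocal
            out.write('%s %s %.4f\n' % (row[0], keywords[c], 1.0 / count))

Note that the graph script above only uses the first two columns of 1.txt; if the reciprocal weights are meant to influence the drawing, one option would be G.add_weighted_edges_from() together with a layout such as nx.spring_layout(G, weight='weight'), but that is a variation rather than what the code above does.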

The resulting graph is shown below:

[Figure 4: keyword network relationship graph]

 
