PageRank通过网络中浩瀚的超链接关系来确定一个页面的等级。Google把从A页面到B页面的链接解释为A页面给B页面投票,并根据投票来源(甚至来源的来源,即链接到A页面的页面)和投票目标的等级来决定新的等级。简单地说,一个高等级的页面可以使其他低等级页面的等级提升。
二、pagerank的多种形态
1、普通型
给定若干个网页,如果每个网页都既有入链又有出链,那么这组网页就属于普通型,比如下面这个矩阵M:
array([[ 0., 1., 1., 1., 0.],
[ 0., 0., 0., 1., 1.],
[ 0., 0., 0., 0., 1.],
[ 0., 0., 0., 0., 1.],
[ 1., 0., 0., 0., 0.]])
矩阵的每一行表示对应网页的出链,比如M(1,3)表示从网页1有一条链接通往网页3,这在图中也可以看得出来。
#coding=utf8
from numpy import *
# Example link graph used by the code below, stored as a 5x5 array.
# Each row holds the out-links of one page: matrix[i][j] == 1 means that
# page i links to page j.
matrix = array([[0,1,1,1,0],
[0,0,0,1,1],
[0,0,0,0,1],
[0,0,0,0,1],
[1,0,0,0,0]],dtype = float) # dtype is set to float
# Alternative 4-page example graph (disabled); its first row is all zeros.
#matrix = array([[0,0,0,0],
# [1,0,0,1],
# [1,1,0,0],
# [1,1,1,0]],dtype = float) # dtype is set to float
def in_source(martix, id):
    """Return column ``id`` of a link matrix.

    With rows holding out-links, column ``id`` holds one entry per page
    describing that page's link into page ``id``.

    Args:
        martix: 2-D numpy array of link weights.  (The parameter name keeps
            its historical spelling so existing callers are unaffected.)
        id: Index of the column to return.

    Returns:
        The selected column as a 1-D numpy view of ``martix``.
    """
    # Index the argument itself rather than the module-level ``matrix``
    # global, so the function works for whichever matrix is passed in.
    return martix[:, id]
def graphMove(matrix):
    """Build the transition matrix of a link graph by normalising each row.

    Every row of ``matrix`` is divided by its own sum, so entry ``[i][j]``
    of the result is the share of page ``i``'s out-link weight that goes to
    page ``j``.

    A row whose sum is zero is first overwritten with ones, i.e. that page
    is treated as linking to every page.  This fill happens in place on the
    caller's ``matrix``; the side effect is kept on purpose because existing
    callers may read the filled rows afterwards.

    NOTE: ``graphMove`` is defined a second time further down in this file;
    the later definition is the one bound after the module is loaded.

    Args:
        matrix: 2-D numpy array of link weights (zero-sum rows are modified
            in place).

    Returns:
        A new float array with the same shape as ``matrix`` holding the
        normalised rows.
    """
    c = zeros(matrix.shape, dtype=float)
    for i in range(matrix.shape[0]):
        if matrix[i].sum() == 0:
            matrix[i] = 1
        # The row sum is the same for every element of the row, so compute
        # it once and divide the whole row instead of once per element.
        c[i] = matrix[i] / matrix[i].sum()
    return c
from copy import deepcopy
class PRIterator:
    """Compute the PR (PageRank) value of every node of a graph by iteration."""

    def __init__(self):
        self.alpha = 0.85  # damping factor α
        self.max_iterations = 100  # maximum number of iterations
        self.min_delta = 0.00001  # convergence threshold on the total change

    def page_rank(self, matrix):
        """Iterate the PageRank update until it converges or the limit is hit.

        Every node starts at ``1 / N``.  In each iteration the new value of
        a node is ``(1 - alpha) / N`` plus ``alpha`` times the sum, over all
        nodes with a non-zero transition weight into it, of that node's
        current value times the transition weight.  Iteration stops once the
        summed absolute change drops below ``self.min_delta`` or after
        ``self.max_iterations`` rounds.  Progress is printed every round.

        Args:
            matrix: 2-D numpy array of link weights, one row per node holding
                its out-links.  It is not modified.

        Returns:
            dict mapping each node index to its PR value ({} for an empty
            graph).
        """
        N = matrix.shape[0]
        if N == 0:
            # Nothing to rank; also avoids dividing by zero below.
            return {}
        nodes = list(range(N))
        page_rank = dict.fromkeys(nodes, 1.0 / N)  # current PR values
        page_rank_last = dict.fromkeys(nodes, 1.0 / N)  # values being built
        damping_value = (1.0 - self.alpha) / N  # the (1 − α) / N term
        # graphMove fills zero-sum rows in place, so hand it a copy to keep
        # the caller's matrix untouched.
        trans_matrix = graphMove(matrix.copy())
        converged = False
        iterations = 0
        for i in range(self.max_iterations):
            iterations = i + 1
            for node in nodes:
                rank = 0
                # Column ``node`` of the transition matrix lists the weight
                # of every link into ``node``; read it from the matrix that
                # was actually passed in, and skip nodes that do not link
                # here.
                for source, weight in enumerate(trans_matrix[:, node]):
                    if weight != 0:
                        rank += self.alpha * page_rank[source] * weight
                rank += damping_value
                page_rank_last[node] = rank
            change = sum([abs(page_rank[n] - page_rank_last[n]) for n in page_rank])
            # The values are plain numbers, so a shallow copy is sufficient.
            page_rank = dict(page_rank_last)
            print("This is NO.%s iteration" % (i + 1))
            print(page_rank)
            if change < self.min_delta:
                converged = True
                break
        if converged:
            print("finished in %s iterations!" % iterations)
        else:
            # Report the configured limit rather than a hard-coded 100.
            print("finished out of %s iterations!" % self.max_iterations)
        return page_rank
from numpy import *
# The same 5-page example graph as ``matrix`` above, bound to the name ``a``.
# NOTE(review): ``a`` is not referenced anywhere else in the visible part of
# this file — confirm whether it is still needed.
a = array([[0,1,1,1,0],
[0,0,0,1,1],
[0,0,0,0,1],
[0,0,0,0,1],
[1,0,0,0,0]],dtype = float) # dtype is set to float
def graphMove(matrix):
    """Build the transition matrix of a link graph by normalising each row.

    Every row of ``matrix`` is divided by its own sum, so entry ``[i][j]``
    of the result is the share of page ``i``'s out-link weight that goes to
    page ``j``.

    A row whose sum is zero is first overwritten with ones, i.e. that page
    is treated as linking to every page.  This fill happens in place on the
    caller's ``matrix``; the side effect is kept on purpose because existing
    callers may read the filled rows afterwards.

    NOTE: this repeats an earlier definition of ``graphMove`` in this file;
    being the later one, it is the definition bound after the module is
    loaded.

    Args:
        matrix: 2-D numpy array of link weights (zero-sum rows are modified
            in place).

    Returns:
        A new float array with the same shape as ``matrix`` holding the
        normalised rows.
    """
    c = zeros(matrix.shape, dtype=float)
    for i in range(matrix.shape[0]):
        if matrix[i].sum() == 0:
            matrix[i] = 1
        # The row sum is the same for every element of the row, so compute
        # it once and divide the whole row instead of once per element.
        c[i] = matrix[i] / matrix[i].sum()
    return c
def firstPr(c):
    """Return the uniform starting PR vector for the matrix ``c``.

    Args:
        c: 2-D numpy array; only its number of rows is used.

    Returns:
        1-D float array with one entry per row of ``c``, each equal to
        ``1 / number_of_rows`` (an empty array when ``c`` has no rows).
    """
    node_count = c.shape[0]
    pr = zeros(node_count, dtype=float)  # vector that holds the PR values
    if node_count:
        # Every page starts with the same share of the total rank.
        pr.fill(float(1) / node_count)
    return pr
# Convergence threshold read by pageRank: iteration continues only while the
# summed absolute change of the PR vector is greater than this value.
T=1.0e-6
def pageRank(p, m, v, tol=None, max_iter=None):
    """Iterate ``v <- p * (v · m) + (1 - p) * n`` until the vector settles.

    ``n`` is the uniform vector returned by ``firstPr(m)``.  A step is
    accepted only while the summed absolute difference between the next
    vector and the current one is greater than the tolerance; the first
    candidate that is within tolerance ends the loop and is not applied.

    Args:
        p: Damping factor applied to the link-following term.
        m: Transition matrix (as produced by ``graphMove``).
        v: Starting PR vector.
        tol: Convergence tolerance; defaults to the module constant ``T``.
        max_iter: Optional cap on the number of accepted steps; ``None``
            (the default) iterates until convergence.

    Returns:
        Tuple ``(v, steps)``: the last accepted vector and the number of
        accepted update steps.
    """
    threshold = T if tol is None else tol
    n = firstPr(m)  # used for the (1 - p) * n term
    teleport = (1 - p) * n  # identical in every iteration, so compute it once
    steps = 0
    while max_iter is None or steps < max_iter:
        # Compute the candidate once and reuse it for both the convergence
        # test and the update.
        candidate = p * dot(v, m) + teleport
        delta = abs(candidate - v).sum()
        # Written as "not greater than" so a NaN delta ends the loop, exactly
        # as a failed ``delta > threshold`` loop condition would.
        if not (delta > threshold):
            break
        v = candidate
        steps += 1
    return v, steps
if __name__ == "__main__":
    # Demo run on the example graph: normalise the link matrix, start from
    # the uniform distribution and print the (PR vector, step count) result.
    transition = graphMove(matrix)
    initial_rank = firstPr(transition)
    damping = 0.85
    result = pageRank(damping, transition, initial_rank)  # compute the PR values
    print(result)