# 加载需要的包
import seaborn as sns #用于画图
from bs4 import BeautifulSoup #用于爬取arxiv的数据
import re #用于正则表达式,匹配字符串的模式
import requests #用于网络连接,发送网络请求,使用域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图工具
import os
os.getcwd()
'D:\\jupyter_notebook\\Github\\datawhale数据分析_学术前沿趋势分析\\AcademicTrends'
def readArxivFile(path, columns=['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'], count=None):
    '''
    Read an arXiv metadata file (one JSON object per line) into a DataFrame.

    path: relative path of the file to read
    columns: columns to keep -- defaults to every column in the dataset
             (the default list is never mutated, so sharing it across calls is safe)
    count: number of rows to read (the raw data has 170k+ rows); None reads all
    '''
    data = []
    # Explicit encoding so the file decodes identically on every platform;
    # without it Windows falls back to a locale codec (e.g. cp936/cp1252)
    # and non-ASCII author names would be mangled or raise.
    with open(path, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx == count:  # already read `count` records -- stop here
                break
            d = json.loads(line)  # each line is one JSON record (dict)
            d = {col: d[col] for col in columns}  # keep only the requested columns
            data.append(d)
    return pd.DataFrame(data)
# Load only the 4 columns needed for the author analysis (file read locally).
data = readArxivFile('./data/arxiv-metadata-oai-2019.json', columns=['id', 'authors', 'categories', 'authors_parsed'],
count = 100000) # read the first 100k records
data
id | authors | categories | authors_parsed | |
---|---|---|---|---|
0 | 0704.0297 | Sung-Chul Yoon, Philipp Podsiadlowski and Step... | astro-ph | [[Yoon, Sung-Chul, ], [Podsiadlowski, Philipp,... |
1 | 0704.0342 | B. Dugmore and PP. Ntumba | math.AT | [[Dugmore, B., ], [Ntumba, PP., ]] |
2 | 0704.0360 | T.V. Zaqarashvili and K Murawski | astro-ph | [[Zaqarashvili, T. V., ], [Murawski, K, ]] |
3 | 0704.0525 | Sezgin Aygun, Ismail Tarhan, Husnu Baysal | gr-qc | [[Aygun, Sezgin, ], [Tarhan, Ismail, ], [Baysa... |
4 | 0704.0535 | Antonio Pipino (1,3), Thomas H. Puzia (2,4), a... | astro-ph | [[Pipino, Antonio, ], [Puzia, Thomas H., ], [M... |
... | ... | ... | ... | ... |
99995 | 1905.00812 | Zhiyi Huang, Xue Zhu | cs.DS | [[Huang, Zhiyi, ], [Zhu, Xue, ]] |
99996 | 1905.00814 | Tuomas P. Hyt\"onen | math.AP math.CV math.FA | [[Hytönen, Tuomas P., ]] |
99997 | 1905.00815 | Morteza Baniasad Azad and Behrooz Khosravi | math.GR | [[Azad, Morteza Baniasad, ], [Khosravi, Behroo... |
99998 | 1905.00816 | Ozgur Asar, Marie-Cecile Fournier, Etienne Dantan | stat.AP | [[Asar, Ozgur, ], [Fournier, Marie-Cecile, ], ... |
99999 | 1905.00818 | Patricia Schmidt, Tanja Hinderer | gr-qc astro-ph.HE | [[Schmidt, Patricia, ], [Hinderer, Tanja, ]] |
100000 rows × 4 columns
data.info() # 这里每一列都是字符串格式
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 100000 non-null object
1 authors 100000 non-null object
2 categories 100000 non-null object
3 authors_parsed 100000 non-null object
dtypes: object(4)
memory usage: 3.1+ MB
data.authors[0]
'Sung-Chul Yoon, Philipp Podsiadlowski and Stephan Rosswog'
data.authors_parsed[0]
[['Yoon', 'Sung-Chul', ''],
['Podsiadlowski', 'Philipp', ''],
['Rosswog', 'Stephan', '']]
# Keep only papers whose category string contains cs.CV
# (substring test, because `categories` may list several space-separated tags)
data2 = data[data['categories'].apply(lambda x: 'cs.CV' in x)]
# PEP 8 (E731): prefer `def` over assigning a lambda to a name -- the function
# then carries a proper __name__ for tracebacks and debugging.
def is_odd(x):
    '''Return True when x is odd, False otherwise.'''
    return x % 2 == 1

print(list(map(is_odd, range(1, 20))))
[True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True]
list(filter(is_odd, range(1, 20))) # is_odd匿名函数 作用在range(1,20)上,filter的结果只保留返回值=True
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
data['categories']
0 astro-ph
1 math.AT
2 astro-ph
3 gr-qc
4 astro-ph
...
99995 cs.DS
99996 math.AP math.CV math.FA
99997 math.GR
99998 stat.AP
99999 gr-qc astro-ph.HE
Name: categories, Length: 100000, dtype: object
# 因为有的论文的categories可能有多个,所以不能直接=="cs.CV",要用in 查看是否包含这个string
data['categories'].apply(lambda x: 'cs.CV' in x) # apply这个匿名函数到categories这一列的每个string(lambda里面的参数x)上
0 False
1 False
2 False
3 False
4 False
...
99995 False
99996 False
99997 False
99998 False
99999 False
Name: categories, Length: 100000, dtype: bool
data2 = data[data['categories'].apply(lambda x: 'cs.CV' in x)]
data2 # 用布尔值,筛选出符合条件的行 i.e论文
id | authors | categories | authors_parsed | |
---|---|---|---|---|
531 | 0802.1412 | Mahesh Pal | cs.NE cs.CV | [[Pal, Mahesh, ]] |
1408 | 0905.1235 | Serguei A. Mokhov, Stephen Sinclair, Ian Cl\'e... | cs.SD cs.CL cs.CV cs.MM cs.NE | [[Mokhov, Serguei A., , for the MARF R&D Group... |
3231 | 1107.2875 | Chris Aholt, Bernd Sturmfels, Rekha Thomas | math.AG cs.CV | [[Aholt, Chris, ], [Sturmfels, Bernd, ], [Thom... |
4120 | 1203.0905 | Jos\'e I. Ronda, Antonio Vald\'es and Guillerm... | cs.CV | [[Ronda, José I., ], [Valdés, Antonio, ], [Gal... |
4378 | 1206.2627 | Tanaya Guha and Rabab K. Ward | cs.CV | [[Guha, Tanaya, ], [Ward, Rabab K., ]] |
... | ... | ... | ... | ... |
99943 | 1905.00742 | Georgios Kapidis and Ronald Poppe and Elsbeth ... | cs.CV | [[Kapidis, Georgios, ], [Poppe, Ronald, ], [va... |
99946 | 1905.00745 | Ahmed Mazari and Hichem Sahbi | cs.CV | [[Mazari, Ahmed, ], [Sahbi, Hichem, ]] |
99965 | 1905.00773 | Mariana-Iuliana Georgescu, Radu Tudor Ionescu | cs.CV cs.LG | [[Georgescu, Mariana-Iuliana, ], [Ionescu, Rad... |
99969 | 1905.00780 | Suraj Srinivas, Francois Fleuret | cs.LG cs.CV stat.ML | [[Srinivas, Suraj, ], [Fleuret, Francois, ]] |
99976 | 1905.00789 | Sheng Lin, Xiaolong Ma, Shaokai Ye, Geng Yuan,... | cs.LG cs.CV stat.ML | [[Lin, Sheng, ], [Ma, Xiaolong, ], [Ye, Shaoka... |
5167 rows × 4 columns
# Flatten the per-paper author lists into one 2-D list of authors.
# A nested comprehension is O(total authors); sum(list_of_lists, []) rebuilds
# the accumulator on every `+` and is quadratic in the number of papers.
all_authors = [author for paper in data2['authors_parsed'] for author in paper]
data2['authors_parsed'][1408]
[['Mokhov', 'Serguei A.', '', 'for the MARF R&D Group'],
['Sinclair', 'Stephen', '', 'for the MARF R&D Group'],
['Clément', 'Ian', '', 'for the MARF R&D Group'],
['Nicolacopoulos', 'Dimitrios', '', 'for the MARF R&D Group']]
# 方法一:列表生成式
[s for l in data2['authors_parsed'][1408] for s in l]
['Mokhov',
'Serguei A.',
'',
'for the MARF R&D Group',
'Sinclair',
'Stephen',
'',
'for the MARF R&D Group',
'Clément',
'Ian',
'',
'for the MARF R&D Group',
'Nicolacopoulos',
'Dimitrios',
'',
'for the MARF R&D Group']
# 方法二:用iterable(二维嵌套列表)和空列表进行sum
sum(data2['authors_parsed'][1408],[])
# sum(iterable, /, start=0)
# Docstring:
# Return the sum of a 'start' value (default: 0) plus an iterable of numbers
# 将空列表[]和iterable可迭代对象中的每个元素:1D列表进行相加
['Mokhov',
'Serguei A.',
'',
'for the MARF R&D Group',
'Sinclair',
'Stephen',
'',
'for the MARF R&D Group',
'Clément',
'Ian',
'',
'for the MARF R&D Group',
'Nicolacopoulos',
'Dimitrios',
'',
'for the MARF R&D Group']
# sum(iterable of lists,[])的过程如下:
[] + ['Mokhov', 'Serguei A.', '', 'for the MARF R&D Group'] \
+ ['Sinclair', 'Stephen', '', 'for the MARF R&D Group'] \
+ ['Clément', 'Ian', '', 'for the MARF R&D Group'] \
+ ['Nicolacopoulos', 'Dimitrios', '', 'for the MARF R&D Group']
['Mokhov',
'Serguei A.',
'',
'for the MARF R&D Group',
'Sinclair',
'Stephen',
'',
'for the MARF R&D Group',
'Clément',
'Ian',
'',
'for the MARF R&D Group',
'Nicolacopoulos',
'Dimitrios',
'',
'for the MARF R&D Group']
data2['authors_parsed'] # 将series中的每个values拼接起来,形成一个大的二维list
531 [[Pal, Mahesh, ]]
1408 [[Mokhov, Serguei A., , for the MARF R&D Group...
3231 [[Aholt, Chris, ], [Sturmfels, Bernd, ], [Thom...
4120 [[Ronda, José I., ], [Valdés, Antonio, ], [Gal...
4378 [[Guha, Tanaya, ], [Ward, Rabab K., ]]
...
99943 [[Kapidis, Georgios, ], [Poppe, Ronald, ], [va...
99946 [[Mazari, Ahmed, ], [Sahbi, Hichem, ]]
99965 [[Georgescu, Mariana-Iuliana, ], [Ionescu, Rad...
99969 [[Srinivas, Suraj, ], [Fleuret, Francois, ]]
99976 [[Lin, Sheng, ], [Ma, Xiaolong, ], [Ye, Shaoka...
Name: authors_parsed, Length: 5167, dtype: object
# 方法一:列表生成式
[j for i in data2['authors_parsed'] for j in i][:10]
[['Pal', 'Mahesh', ''],
['Mokhov', 'Serguei A.', '', 'for the MARF R&D Group'],
['Sinclair', 'Stephen', '', 'for the MARF R&D Group'],
['Clément', 'Ian', '', 'for the MARF R&D Group'],
['Nicolacopoulos', 'Dimitrios', '', 'for the MARF R&D Group'],
['Aholt', 'Chris', ''],
['Sturmfels', 'Bernd', ''],
['Thomas', 'Rekha', ''],
['Ronda', 'José I.', ''],
['Valdés', 'Antonio', '']]
# 方法二:用sum(iterable,[])
# 其中iterable:是series的形式,values仍然是二维list
sum(data2['authors_parsed'], [])[:10]
[['Pal', 'Mahesh', ''],
['Mokhov', 'Serguei A.', '', 'for the MARF R&D Group'],
['Sinclair', 'Stephen', '', 'for the MARF R&D Group'],
['Clément', 'Ian', '', 'for the MARF R&D Group'],
['Nicolacopoulos', 'Dimitrios', '', 'for the MARF R&D Group'],
['Aholt', 'Chris', ''],
['Sturmfels', 'Bernd', ''],
['Thomas', 'Rekha', ''],
['Ronda', 'José I.', ''],
['Valdés', 'Antonio', '']]
all_authors[:10]
[['Pal', 'Mahesh', ''],
['Mokhov', 'Serguei A.', '', 'for the MARF R&D Group'],
['Sinclair', 'Stephen', '', 'for the MARF R&D Group'],
['Clément', 'Ian', '', 'for the MARF R&D Group'],
['Nicolacopoulos', 'Dimitrios', '', 'for the MARF R&D Group'],
['Aholt', 'Chris', ''],
['Sturmfels', 'Bernd', ''],
['Thomas', 'Rekha', ''],
['Ronda', 'José I.', ''],
['Valdés', 'Antonio', '']]
for x in all_authors[:10]:
print(x)
['Pal', 'Mahesh', '']
['Mokhov', 'Serguei A.', '', 'for the MARF R&D Group']
['Sinclair', 'Stephen', '', 'for the MARF R&D Group']
['Clément', 'Ian', '', 'for the MARF R&D Group']
['Nicolacopoulos', 'Dimitrios', '', 'for the MARF R&D Group']
['Aholt', 'Chris', '']
['Sturmfels', 'Bernd', '']
['Thomas', 'Rekha', '']
['Ronda', 'José I.', '']
['Valdés', 'Antonio', '']
" ".join(['Pal', 'Mahesh', '']) # " ".join(iterable)--iterable是个列表
# 将iterable的每个元素用空格" "进行拼接
'Pal Mahesh '
[' '.join(x) for x in all_authors[:10]]
# all_authors是个嵌套的2D列表--每个元素是个list
# x:all_authors这个iterable中的每个元素--list
# 对于x这个iterable中的每个元素--string--用" "进行拼接
# 列表生成式,每个元素是作者名in string--每个部分用空格分隔形成的string
['Pal Mahesh ',
'Mokhov Serguei A. for the MARF R&D Group',
'Sinclair Stephen for the MARF R&D Group',
'Clément Ian for the MARF R&D Group',
'Nicolacopoulos Dimitrios for the MARF R&D Group',
'Aholt Chris ',
'Sturmfels Bernd ',
'Thomas Rekha ',
'Ronda José I. ',
'Valdés Antonio ']
# Join each parsed name's parts with a space -> one display string per author
authors_names = [' '.join(x) for x in all_authors]
authors_names = pd.DataFrame(authors_names,columns=["names"]) # list -> DataFrame
authors_names
names | |
---|---|
0 | Pal Mahesh |
1 | Mokhov Serguei A. for the MARF R&D Group |
2 | Sinclair Stephen for the MARF R&D Group |
3 | Clément Ian for the MARF R&D Group |
4 | Nicolacopoulos Dimitrios for the MARF R&D Group |
... | ... |
23122 | Ma Xiaolong |
23123 | Ye Shaokai |
23124 | Yuan Geng |
23125 | Ma Kaisheng |
23126 | Wang Yanzhi |
23127 rows × 1 columns
authors_names["names"].value_counts() # 去重&计数
# 统计每个unique的名字出现的次数
Tao Dacheng 50
Van Gool Luc 27
Liu Wei 24
Zhang Lei 22
Darrell Trevor 21
..
Ding Shouhong 1
Li Chun-Guang 1
Eitel Fabian 1
Zhu Qikui 1
Wu Bojian 1
Name: names, Length: 15131, dtype: int64
authors_names["names"].value_counts().head(10) # 发表论文数top10的作者姓名
Tao Dacheng 50
Van Gool Luc 27
Liu Wei 24
Zhang Lei 22
Darrell Trevor 21
Wang Xiaogang 21
Navab Nassir 21
Reid Ian 18
Zafeiriou Stefanos 17
Torr Philip H. S. 17
Name: names, dtype: int64
# Plot: top-10 most frequent author names as a horizontal bar chart
plt.figure(figsize=(10,6)) # create the figure/canvas
authors_names["names"].value_counts().head(10).plot(kind="barh");
# Tweak figure settings
names = authors_names["names"].value_counts().index[:10]
_ = plt.yticks(range(0, len(names)), names) # a plain Index works here; no need for index.values[:10]
plt.ylabel('Author')
plt.xlabel('Count')
authors_names["names"].value_counts().head(10)
Tao Dacheng 50
Van Gool Luc 27
Liu Wei 24
Zhang Lei 22
Darrell Trevor 21
Wang Xiaogang 21
Navab Nassir 21
Reid Ian 18
Zafeiriou Stefanos 17
Torr Philip H. S. 17
Name: names, dtype: int64
authors_names["names"].value_counts().index[:10]
# value_counts()的结果是个series
# series.index是所有unique的作者姓名--结果是Index(...)
# 这里取前10
Index(['Tao Dacheng ', 'Van Gool Luc ', 'Liu Wei ', 'Zhang Lei ',
'Darrell Trevor ', 'Wang Xiaogang ', 'Navab Nassir ', 'Reid Ian ',
'Parikh Devi ', 'Davis Larry S. '],
dtype='object')
authors_names["names"].value_counts().index.values
# series.index.values--结果是array(...)
array(['Tao Dacheng ', 'Van Gool Luc ', 'Liu Wei ', ..., 'Galal Sameh ',
'Davis L. Taylor ', 'Jubair Mohammad Imrul '], dtype=object)
authors_parsed
字段中作者第一个单词:# all_authors中每个列表的第一个string
[x[0] for x in all_authors][:10] # all_authors这个可迭代对象中的每个元素x: 一维列表 # 按顺序显示前10个
# 从x取出第一个元素string--last name
['Pal',
'Mokhov',
'Sinclair',
'Clément',
'Nicolacopoulos',
'Aholt',
'Sturmfels',
'Thomas',
'Ronda',
'Valdés']
# Last name = first token of each parsed author record
authors_lastnames = [x[0] for x in all_authors]
authors_lastnames = pd.DataFrame(authors_lastnames,columns=["last_names"]) # list -> DataFrame
authors_lastnames
last_names | |
---|---|
0 | Pal |
1 | Mokhov |
2 | Sinclair |
3 | Clément |
4 | Nicolacopoulos |
... | ... |
23122 | Ma |
23123 | Ye |
23124 | Yuan |
23125 | Ma |
23126 | Wang |
23127 rows × 1 columns
# Plot: top-10 most frequent author last names
plt.figure(figsize=(10, 6))
authors_lastnames["last_names"].value_counts().head(10).plot(kind="barh")
names = authors_lastnames["last_names"].value_counts().index.values[:10]
_ = plt.yticks(range(0, len(names)), names)
plt.ylabel('Author Last_Names')
plt.xlabel('Count')
Text(0.5, 0, 'Count')
all_authors[0] # i--i.e二维嵌套列表中的每个元素--1D list
['Pal', 'Mahesh', '']
all_authors[0][0] # last name
'Pal'
all_authors[0][0][0] # last name的第一个字符
'P'
[i[0][0] for i in all_authors ][:10]
['P', 'M', 'S', 'C', 'N', 'A', 'S', 'T', 'R', 'V']
# First character of each last name (i[0] = last name, i[0][0] = its initial)
authors_lastnames_first = [i[0][0] for i in all_authors ]
authors_lastnames_first = pd.DataFrame(authors_lastnames_first,columns=["last_names_first_character"]) # list -> DataFrame
authors_lastnames_first
last_names_first_character | |
---|---|
0 | P |
1 | M |
2 | S |
3 | C |
4 | N |
... | ... |
23122 | M |
23123 | Y |
23124 | Y |
23125 | M |
23126 | W |
23127 rows × 1 columns
# Plot: top-10 most frequent first characters of author last names
plt.figure(figsize=(10, 6))
authors_lastnames_first["last_names_first_character"].value_counts().head(10).plot(kind='barh')
names = authors_lastnames_first["last_names_first_character"].value_counts().index[:10]
_ = plt.yticks(range(0, len(names)), names)
plt.ylabel('Author Last_Names_First_Character')
plt.xlabel('Count')
plt.show()
# %load Task2 论文作者统计.py
#!/usr/bin/env python
# ## 任务说明
#
# - 任务主题:论文作者统计,统计所有论文作者出现频率Top10的姓名;
# - 任务内容:论文作者的统计、使用 **Pandas** 读取数据并使用字符串操作;
# - 任务成果:学习 **Pandas** 的字符串操作;
# ## 数据处理步骤
#
# 在原始arxiv数据集中论文作者`authors`字段是一个字符串格式,其中每个作者使用逗号进行分隔,所以我们首先需要完成以下步骤:
#
# - 使用逗号对作者进行切分;
# - 剔除单个作者中非常规的字符;
#
# 具体操作可以参考以下例子:
# ```
# C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan
#
# # 切分为,其中\\为转义符
#
# C. Ba'lazs
# E. L. Berger
# P. M. Nadolsky
# C.-P. Yuan
# ```
# 当然在原始数据集中`authors_parsed`字段已经帮我们处理好了作者信息,可以直接使用该字段完成后续统计。
# ## 字符串处理
#
# 在Python中字符串是最常用的数据类型,可以使用引号('或")来创建字符串。Python中所有的字符都使用字符串存储,可以使用方括号来截取字符串,如下实例:
# In[1]:
# String slicing demo: negative index counts from the end.
var1 = 'Hello Datawhale!'
var2 = "Python Everwhere!"
print("var1[-10:]: ", var1[-10:])
# Bug fix: the label said var2[2:7] but the slice was var2[0:7] -- make them agree.
print("var2[2:7]: ", var2[2:7])
# 同时在Python中还支持转义符:
#
# | \(在行尾时) | 续行符 |
# | ----------- | ---------- |
# | \\ | 反斜杠符号 |
# | \' | 单引号 |
# | \" | 双引号 |
# | \n | 换行 |
# | \t | 横向制表符 |
# | \r | 回车 |
#
# Python中还内置了很多内置函数,非常方便使用:
#
# | **方法** | **描述** |
# | :------------------ | :----------------------------------------------------------- |
# | string.capitalize() | 把字符串的第一个字符大写 |
# | string.isalpha() | 如果 string 至少有一个字符并且所有字符都是字母则返回 True,否则返回 False |
# | string.title() | 返回"标题化"的 string,就是说所有单词都是以大写开始,其余字母均为小写(见 istitle()) |
# | string.upper() | 转换 string 中的小写字母为大写 |
#
# ## 具体代码实现以及讲解
# ### 数据读取
# In[1]:
# 导入所需的package
import seaborn as sns #用于画图
from bs4 import BeautifulSoup #用于爬取arxiv的数据
import re #用于正则表达式,匹配字符串的模式
import requests #用于网络连接,发送网络请求,使用域名获取对应信息
import json #读取数据,我们的数据为json格式的
import pandas as pd #数据处理,数据分析
import matplotlib.pyplot as plt #画图工具
# In[9]:
def readArxivFile(path, columns=['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
                                 'report-no', 'categories', 'license', 'abstract', 'versions',
                                 'update_date', 'authors_parsed'], count=None):
    '''
    Load an arXiv metadata file (one JSON record per line) into a DataFrame.

    path: path of the file to read
    columns: columns to keep
    count: number of rows to read (None reads the whole file)
    '''
    records = []
    with open(path, 'r') as f:
        for row_no, raw in enumerate(f):
            if row_no == count:
                break
            parsed = json.loads(raw)
            records.append({col: parsed[col] for col in columns})
    return pd.DataFrame(records)
# Load only the columns needed for the author analysis; 100k rows keeps it fast.
data = readArxivFile('arxiv-metadata-oai-snapshot.json',
                     ['id', 'authors', 'categories', 'authors_parsed'],
                     100000)
# 为了方便处理数据,我们只选择了三个字段进行读取。
# ### 数据统计
#
# 接下来我们将完成以下统计操作:
#
# - 统计所有作者姓名出现频率的Top10;
# - 统计所有作者姓(姓名最后一个单词)的出现频率的Top10;
# - 统计所有作者姓第一个字符的频率;
#
# 为了节约计算时间,下面选择部分类别下的论文进行处理:
# In[10]:
# Keep only papers whose category string contains cs.CV
data2 = data[data['categories'].apply(lambda x: 'cs.CV' in x)]
# Flatten the per-paper author lists into one list of authors.
# A comprehension is linear; sum(list_of_lists, []) re-copies the accumulator
# on every addition and is quadratic in the number of papers.
all_authors = [author for paper in data2['authors_parsed'] for author in paper]
# 处理完成后`all_authors`变成了一个list,其中每个元素为一个作者的姓名。我们首先来完成姓名频率的统计。
# In[11]:
# Join each parsed name's parts with spaces -> one display string per author
authors_names = [' '.join(x) for x in all_authors]
authors_names = pd.DataFrame(authors_names)
# Bar chart of the 10 most frequent author names
plt.figure(figsize=(10, 6))
authors_names[0].value_counts().head(10).plot(kind='barh')
# Tweak figure settings
names = authors_names[0].value_counts().index.values[:10]
_ = plt.yticks(range(0, len(names)), names)
plt.ylabel('Author')
plt.xlabel('Count')
# 接下来统计姓名姓,也就是`authors_parsed`字段中作者第一个单词:
# In[12]:
# Last name = first token of each parsed author record
authors_lastnames = [x[0] for x in all_authors]
authors_lastnames = pd.DataFrame(authors_lastnames)
# Bar chart of the 10 most frequent last names
plt.figure(figsize=(10, 6))
authors_lastnames[0].value_counts().head(10).plot(kind='barh')
names = authors_lastnames[0].value_counts().index.values[:10]
_ = plt.yticks(range(0, len(names)), names)
plt.ylabel('Author')
plt.xlabel('Count')
# 绘制得到的结果,从结果看出这些都是华人或者中国姓氏~
#
#
# 统计所有作者姓第一个字符的频率,这个流程与上述的类似,同学们可以自行尝试。
#