Without further ado, here's the code first:
This script uses tweepy to collect Twitter data from the Streaming API.
Each tweet is written both to a JSON text file and to a MongoDB database.
# import
from __future__ import print_function
import tweepy
import json
from pymongo import MongoClient
# filter words
WORDS = ["Chelsea", "FIFA MOBILE", "premier league"]
# bounding box around Glasgow: [SW longitude, SW latitude, NE longitude, NE latitude]
GEO = [-4.55, 55.7, -4, 56]
MONGO_HOST = 'mongodb://localhost:27017/twitter_database'  # MongoDB connection string
FILE_NAME = "tweets_search_data.json"  # file the raw tweets are appended to
# keys from your Twitter developer account (use your own; never publish real credentials)
CONSUMER_KEY = "YOUR_CONSUMER_KEY"
CONSUMER_SECRET = "YOUR_CONSUMER_SECRET"
ACCESS_TOKEN = "YOUR_ACCESS_TOKEN"
ACCESS_TOKEN_SECRET = "YOUR_ACCESS_TOKEN_SECRET"
class StreamListener(tweepy.StreamListener):
    # A listener class provided by tweepy for the Twitter Streaming API.
    def on_connect(self):
        # called once the stream is successfully connected
        print("You are now connected to the streaming API.")

    def on_error(self, status_code):
        # called when an error occurs; print the status code and disconnect
        print('An error has occurred: ' + repr(status_code))
        return False

    def on_data(self, data):
        # called for every message Twitter pushes down the stream
        try:
            print(data)
            with open(FILE_NAME, 'a') as tf:  # append the raw JSON line to the text file
                tf.write(data)
            client = MongoClient(MONGO_HOST)  # connect to MongoDB (a new client per message keeps the example simple)
            db = client.twitter_database  # select (or create) the database
            data_json = json.loads(data)  # decode the JSON from Twitter
            # created_at = data_json['created_at']
            # print("Tweet collected time is " + str(created_at))
            db.twitterdb_collection.insert_one(data_json)  # insert the tweet into a collection
        except Exception as e:
            print(e)
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# Set up the listener; wait_on_rate_limit=True tells tweepy to back off when the Twitter API rate-limits us.
listener = StreamListener(api=tweepy.API(wait_on_rate_limit=True))
streamer = tweepy.Stream(auth=auth, listener=listener)
# print("Searching keywords are: " + str(WORDS)) # filter keywords
# streamer.filter(track=WORDS)
print("Searching geo-tagged: " + str(GEO))
streamer.filter(locations=GEO)
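To check that the tweets actually landed in MongoDB, a quick read-back query such as the sketch below can be run after the stream has been collecting for a while. It assumes the database and collection names used above (twitter_database / twitterdb_collection); count_documents, find, sort and limit are standard pymongo calls.

from pymongo import MongoClient

# minimal verification sketch, assuming the database/collection names used above
client = MongoClient('mongodb://localhost:27017/')
db = client.twitter_database
print("tweets stored:", db.twitterdb_collection.count_documents({}))
# print the timestamp and text of the three most recently inserted tweets
for doc in db.twitterdb_collection.find({}, {'created_at': 1, 'text': 1}).sort('_id', -1).limit(3):
    print(doc.get('created_at'), '-', doc.get('text'))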
The following code works on the captured data:
it reads the JSON text file, processes it, and draws charts.
import pandas as pd
import matplotlib.pyplot as plt

# read the line-delimited JSON file produced by the streaming script
df = pd.read_json('./tweets_data_10min.json', lines=True, encoding='utf-8')
# print (df.columns.values) # print all columns value
# print(list(df))
df_choose = df[['created_at', 'id', 'place', 'retweeted_status', 'quoted_status']]  # keep only the columns we need in a new DataFrame
# print (df_choose)
def draw_chart(num_list, title):  # draw a bar chart of per-minute counts
    name_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # x-axis tick labels (minutes 1-10)
    plt.bar(range(len(name_list)), num_list, color='rgb', tick_label=name_list)  # 'rgb' cycles red/green/blue bars
    plt.xlabel("minutes")
    plt.ylabel("count")
    plt.title(title)
    plt.show()
def count_according_time(df_temp, title):  # count tweets in each of the 10 minutes after the first tweet
    number_temp = [0] * 10
    date = df_temp.iat[0, 0]  # timestamp of the first tweet (created_at is the first column)
    # print("start time is ", date)
    for i in range(0, 10):
        # note: between() is inclusive at both ends, so a tweet on an exact
        # minute boundary is counted in two adjacent bins
        c = df_temp['created_at'].between(date + pd.offsets.DateOffset(minutes=i),
                                          date + pd.offsets.DateOffset(minutes=i + 1)).sum()
        number_temp[i] = c
    draw_chart(number_temp, title)
# draw the total count chart
count_according_time(df_choose, 'Total Count')
# draw the geo-tagged count chart
df_geo = df_choose[df_choose['place'].notnull()]  # keep only rows with a place (geo) tag
count_according_time(df_geo, 'geo-tagged from Glasgow Count')
# draw the quote chart
df_quoted = df_choose[df_choose['quoted_status'].notnull()]  # keep only rows that quote another tweet
count_according_time(df_quoted, 'Quoted Tweets Count')
# draw the retweet count chart
df_retweeted = df_choose[df_choose['retweeted_status'].notnull()]
count_according_time(df_retweeted, 'Retweets Count')
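As a side note, the manual ten-bin loop in count_according_time can also be written with pandas' own time-series grouping; the sketch below is an alternative per-minute count using pd.Grouper (the column name created_at and the DataFrame df_choose come from the code above, everything else is standard pandas).

# alternative sketch: per-minute tweet counts via pandas time-series grouping
per_minute = df_choose.groupby(pd.Grouper(key='created_at', freq='1min')).size()
print(per_minute.head(10))  # counts for the first ten minutes of data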