抓取 Twitter 数据存入 MongoDB 并画图

话不多说,先上代码:
下面这段代码通过 tweepy 抓取 Twitter 数据,
并将数据同时存入 JSON 文本文件和 MongoDB 数据库。

# import
from __future__ import print_function
import tweepy
import json
from pymongo import MongoClient

# Keywords for the (optional) keyword-based stream filter.
WORDS = [
    "Chelsea",
    "FIFA MOBILE",
    "premier league",
]

# Bounding box around Glasgow handed to the stream's `locations` filter.
# Presumably ordered [west lon, south lat, east lon, north lat] -- confirm
# against the Twitter streaming API documentation.
GEO = [-4.55, 55.7, -4, 56]

# Where the collected tweets go: a MongoDB URI and a local JSON file.
MONGO_HOST = 'mongodb://localhost:27017/twitter_database'
FILE_NAME = "tweets_search_data.json"

# Credentials issued by the Twitter developer portal.
# NOTE(review): these look like real secrets committed to source; rotate them
# and load them from the environment or a config file instead.
CONSUMER_KEY = "nd7hMoeF7J1RB7C8VZ3lhCJzK"
CONSUMER_SECRET = "VCExyTGAwqWkeGdDOJplnLTCoTTZ0AKKNUr1CQiptswP8ahTZ4"
ACCESS_TOKEN = "1051580667304337408-fQSLqomYg6Bz1LiFvWMva5IuOD5tqz"
ACCESS_TOKEN_SECRET = "b9oOb59R9nPea39iodulASptxjQwXh9N1NWSo5ivvpld4"

# Initialised to zero; not referenced anywhere in the visible code.
numS = 0


class StreamListener(tweepy.StreamListener):
    """Tweepy stream listener that archives every message it receives.

    Each raw message is echoed to stdout, appended to FILE_NAME and inserted
    into the ``twitterdb_collection`` collection of ``twitter_database``.
    """

    # MongoDB client, created on first use and then reused.  MongoClient keeps
    # its own connection pool, so building a new one for every message (as the
    # code used to) only leaks sockets.
    _mongo_client = None

    def _collection(self):
        """Return the target MongoDB collection, connecting on first use."""
        if self._mongo_client is None:
            self._mongo_client = MongoClient(MONGO_HOST)
        return self._mongo_client.twitter_database.twitterdb_collection

    def on_connect(self):
        """Announce that the streaming connection has been established."""
        print("You are now connected to the streaming API.")

    def on_error(self, status_code):
        """Print the error status code and return False.

        NOTE(review): tweepy is expected to treat a False return value as a
        request to disconnect the stream -- confirm against the tweepy docs.
        """
        print('An Error has occured: ' + repr(status_code))
        return False

    def on_data(self, data):
        """Store one raw message from the stream.

        ``data`` is the raw JSON text of the message.  It is printed, appended
        verbatim to FILE_NAME, then decoded and inserted into MongoDB.  Any
        failure is printed and swallowed so that one bad message does not
        abort the whole capture.
        """
        try:
            print(data)

            # Append the raw text so the file can be re-read later.
            with open(FILE_NAME, 'a') as tf:
                tf.write(data)

            data_json = json.loads(data)  # decode the JSON sent by Twitter
            # insert_one() replaces Collection.insert(), which is deprecated
            # in PyMongo 3 and removed in PyMongo 4.
            self._collection().insert_one(data_json)

        except Exception as e:
            # Deliberately best-effort: report the problem and keep streaming.
            print(e)


# Build the OAuth handler from the consumer key/secret and attach the access token.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# Set up the listener. The 'wait_on_rate_limit=True' is needed to help with Twitter API rate limiting.
# NOTE(review): the API object given to the listener is created without `auth`;
# only the Stream below receives the credentials -- confirm this is intended.
listener = StreamListener(api=tweepy.API(wait_on_rate_limit=True))
streamer = tweepy.Stream(auth=auth, listener=listener)

# Alternative mode (disabled): filter the stream by the keywords in WORDS.
# print("Searching keywords are: " + str(WORDS))  # filter keywords
# streamer.filter(track=WORDS)
# Active mode: filter the stream by the GEO bounding box.
print("Searching geo-tagged: " + str(GEO))
streamer.filter(locations=GEO)

以下代码读取抓取到的文本数据,
并进行数据处理和画图:

import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates


# Load the captured tweets: the file holds one JSON document per line.
# NOTE(review): the capture script writes to "tweets_search_data.json"; this
# script reads a differently named file -- presumably a renamed 10-minute capture.
df = pd.read_json('./tweets_data_10min.json', lines=True, encoding='utf-8')

# To inspect which columns are available: print(list(df))

# Keep only the columns the charts need.  reindex() is used instead of
# df[[...]] so that a capture with, for example, no quoted tweets (and hence
# no 'quoted_status' column) gets an all-NaN column instead of a KeyError.
df_choose = df.reindex(
    columns=['created_at', 'id', 'place', 'retweeted_status', 'quoted_status'])


def draw_chart(num_list, title):
    """Show a bar chart with one bar per entry of ``num_list``.

    num_list -- sequence of counts, one per minute.
    title    -- text used as the chart title.

    Bars are labelled 1..len(num_list) on the x axis, so the chart matches
    whatever number of bins the caller supplies rather than a fixed 10.
    """
    positions = range(len(num_list))
    name_list = [pos + 1 for pos in positions]  # x-axis labels start at 1
    # Colours are given as a list: recent Matplotlib releases reject the
    # shorthand string 'rgb' as a sequence of single-letter colours.
    plt.bar(positions, num_list, color=['r', 'g', 'b'], tick_label=name_list)
    plt.xlabel("minutes")
    plt.ylabel("count")
    plt.title(title)
    plt.show()


def _count_per_minute(timestamps, start, minutes=10):
    # Count the timestamps in each one-minute bin [start + i, start + i + 1).
    counts = []
    for i in range(minutes):
        lower = start + pd.offsets.DateOffset(minutes=i)
        upper = start + pd.offsets.DateOffset(minutes=i + 1)
        # Half-open bins: Series.between() includes both ends, which would
        # count a tweet sitting exactly on a minute boundary in two bins.
        in_bin = (timestamps >= lower) & (timestamps < upper)
        counts.append(int(in_bin.sum()))
    return counts


def count_according_time(df_temp, title):
    """Chart how many rows of ``df_temp`` fall in each of the first 10 minutes.

    df_temp -- DataFrame with a 'created_at' datetime column.
    title   -- chart title, passed through to draw_chart.

    The clock starts at the 'created_at' value of the first row.  An empty
    frame produces a chart of ten zero-height bars instead of an IndexError.
    """
    if df_temp.empty:
        draw_chart([0] * 10, title)
        return

    timestamps = df_temp['created_at']
    # Look the column up by name so the result does not depend on
    # 'created_at' happening to be the first column.
    start = timestamps.iloc[0]
    draw_chart(_count_per_minute(timestamps, start), title)


# Chart 1: every collected tweet.
count_according_time(df_choose, 'Total Count')

# Chart 2: only the rows whose 'place' field is set.
df_geo = df_choose.loc[df_choose['place'].notnull()]
count_according_time(df_geo, 'geo-tagged from Glasgow Count')

# Chart 3: only the rows that carry a 'quoted_status'.
df_quoted = df_choose.loc[df_choose['quoted_status'].notnull()]
count_according_time(df_quoted, 'Quoted Tweets Count')

# Chart 4: only the rows that carry a 'retweeted_status'.
df_retweeted = df_choose.loc[df_choose['retweeted_status'].notnull()]
count_according_time(df_retweeted, 'Retweets Count')

你可能感兴趣的:(Python)