Processing program-related Weibo posts after strong filtering

1. Filter the raw data (.data files), implemented in Java

package com.bobo.DataPre;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import com.bobo.util.Constants;
import com.bobo.util.Fenci;
import com.bobo.util.StopwordsRemover;
import com.bobo.util.StringUtil;
import com.bobo.util.UtilityForRemoveAtName;

public class ProgramDataFilter {

    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        ProgramDataFilter pre = new ProgramDataFilter();
        String inFilePath;
        String outFilePath;
        String programName;
        String[] keywords;
        for (int i = 0; i < Constants.ProgramNameList.length; i++) {
            programName = Constants.ProgramNameList[i];
            keywords = Constants.keywordsList[i];
            inFilePath = Constants.TitleDir + File.separator + programName
                    + ".title.uniqByWeiboId";
            outFilePath = Constants.FilterDir + File.separator + programName
                    + ".filter.fenci";
            pre.dataSetAndRmStop(inFilePath, outFilePath, programName, keywords);
            long end = System.currentTimeMillis();
            System.out.println(programName
                    + " preprocessing (segmentation, stopword removal, @-mention removal) took "
                    + (end - start) / 1000 + "s");
        }
    }

    /**
     * A post is relevant if it contains the program name and, for ambiguous
     * names, some additional evidence (title marks, an actor name or the
     * program category).
     */
    public boolean isRelative(String weiboText, String programName,
            String[] filterWords) {
        // Must contain the program name.
        if (!weiboText.contains(programName)) {
            return false;
        }
        // For names with little ambiguity the name alone is enough.
        if (filterWords.length < 1) {
            return true;
        }

        // The program name wrapped in book-title marks counts as unambiguous.
        if (weiboText.contains("《" + programName + "》")) {
            return true;
        }

        // Otherwise require an actor name or the program category as well.
        for (String keyword : filterWords) {
            if (weiboText.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    // Step 1: word segmentation, stopword removal and removal of @user mentions.
    private void dataSetAndRmStop(String inFilePath, String outFilePath,
            String programName, String[] keywords) {
        BufferedReader br = null;
        PrintWriter pw = null;
        String line = null;
        Fenci fenci = new Fenci();
        fenci.initial();
        StopwordsRemover stop = new StopwordsRemover();
        stop.setStoppingListSet(stop
                .loadStoppingListSet("./conf/stopwords.list"));
        String weiboText;

        try {
            br = new BufferedReader(new FileReader(inFilePath));
            pw = new PrintWriter(new BufferedWriter(new FileWriter(outFilePath)));

            while ((line = br.readLine()) != null) {
                String[] lineArr = line.split("\t");
                if (lineArr.length != 3) {
                    continue;
                }
                weiboText = lineArr[1];
                if (StringUtil.isNullOrEmpty(weiboText)) {
                    continue;
                }
                if (!isRelative(weiboText, programName, keywords)) {
                    continue;
                }
                // Strip @user mentions, segment the text, then drop stopwords.
                String fenciString = stop.removeStoppingWords(fenci
                        .testICTCLAS_ParagraphProcess(UtilityForRemoveAtName
                                .removeName(weiboText)));
                if (!StringUtil.isNullOrEmpty(fenciString)) {
                    pw.println(lineArr[0] + "\t" + fenciString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("ProgramDataFilter: exception while filtering " + inFilePath);
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            if (pw != null) {
                pw.close();
            }
        }
    }
}
Filtering and word segmentation

The generated file format is:

user id "\t" segmented Weibo text
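
The Java filter silently skips any input line that does not split into exactly three tab-separated fields (the Weibo text is expected in the second field). A quick sanity check on one of the input files, sketched here with $program standing in for a name from Constants.ProgramNameList:

# Count input lines the filter would drop for not having exactly three
# tab-separated fields ($program is a placeholder, not a real file name).
awk -F'\t' 'NF != 3' "$program".title.uniqByWeiboId | wc -l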

2. Extract the list of users under each program and match it with each user's profile

#!/bin/bash

data_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/data
user_file=/home/minelab/liweibo/springNightForLargePaper/springNightUser/sina_user.data
dest_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/userForProgram

program_list=`ls $data_dir | awk -F'.' '{print $1}'`

for program in $program_list
do
    rm -rf $dest_dir/"$program"_userid_times_profile.map
    # Count how many posts each user contributed to this program, then sort by user id.
    cat $data_dir/"$program".filter.fenci | awk -F'\t' '{print $1}' | sort | uniq -c | sort -r -n | sed 's/^ *//g' | sed 's/ /\t/g' | awk -F'\t' '{print $2"\t"$1}' | sort > $dest_dir/"$program"_userid_times.map
    # Attach each user's profile by joining on the user id.
    join -t $'\t' $dest_dir/"$program"_userid_times.map $user_file > $dest_dir/"$program"_userid_times_profile.map
    rm -rf $dest_dir/"$program"_userid_times.map
    echo $program is done!
done
echo done!
Extract each program's user id list and the corresponding user profiles
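
Note that join only merges lines correctly when both of its inputs are sorted on the join field. The per-program id list is sorted by the pipeline above; sina_user.data is assumed here to already be sorted by its first tab-separated column (the user id). If it is not, a one-off pre-sort along these lines would do (a sketch; the name of the sorted copy is made up):

# Pre-sort the profile file on the user id column so that join sees sorted input.
sort -t $'\t' -k1,1 $user_file > ${user_file}.sorted

The join command would then read ${user_file}.sorted instead of $user_file.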

3. Build: program id "\t" number of users who commented on the program "\t" list of those users' ids

#!/bin/bash

# Output format: program id "\t" number of users who commented on the program "\t"
# list of their user ids (ids separated by spaces).
# If a user commented on a program more than once, it is counted only once.
program_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/userForProgram
inter_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/inter_data
result_file=$inter_dir/programid_userlist.map
tmp_file=$inter_dir/programid_userlist.tmp
program_list=`ls $program_dir`

rm -rf $result_file
rm -rf $tmp_file
i=1
for program in $program_list
do
    user_list=`cat $program_dir/$program | awk -F'\t' '{printf("%s ",$1);}END{print;}'`
    line_num=`cat $program_dir/$program | wc -l | awk '{print $1}'`
    # Fields are tab-separated so that the final sort can split on \t.
    echo -e "$i\t$line_num\t$user_list" >> $tmp_file
    i=$[$i+1]
done

# Sort programs by popularity (number of commenting users).
cat $tmp_file | sort -t $'\t' -k 2 -r -n > $result_file
rm -rf $tmp_file
echo "done"
Build the program-user matrix

4. Build: user id "\t" number of programs the user commented on "\t" list of those programs

#!/usr/bin/python
# Invert the program -> user list mapping into user -> program list.
import sys

def main():
    inputfile  = "programid_userlist.map"
    outputfile = "tmp"

    fin = open(inputfile, 'r')
    fout = open(outputfile, 'w')
    user_program = {}
    for line in fin:
        fields = line.strip().split('\t')
        program_id = fields[0]
        userids = fields[2].split(' ')
        for userid in userids:
            if userid not in user_program:
                user_program[userid] = set()
            user_program[userid].add(program_id)
    fin.close()

    # user id "\t" number of programs commented on "\t" space-separated program ids
    for userid in user_program:
        print>>fout, "%s\t%s\t%s" % (userid, len(user_program[userid]), ' '.join(user_program[userid]))
    fout.close()

if __name__ == "__main__":
    main()
Build the user-program matrix

5. A simple term-frequency count of the hot words under each program

#!/bin/sh

# Hot words under each program, ranked by term frequency (top 1000).
data_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/data
result_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/topWords
file_list=`ls $data_dir | awk -F'.' '{print $1}'`

for file in $file_list
do
    cat $data_dir/$file".filter.fenci" | awk -F'\t' '{print $2}' | sed 's/^ //g' | sed 's/ /\n/g' | sort | uniq -c | sort -r -n | sed 's/^ *//g' | awk '{print $2"\t"$1}' > $result_dir/$file.topwords
    echo $file is done!
done
echo done!
topwords

6. Count the tag distribution of each program's users by number of occurrences
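
No script is attached to this step in the original notes; below is a minimal sketch in the same shell-pipeline style as steps 3 and 5. It assumes that the "$program"_userid_times_profile.map files produced in step 2 carry each user's tags in their last tab-separated field, with individual tags separated by spaces, and that the program name is the part of the file name before the first underscore; the result directory is made up. Adjust the field index and separators to the real profile layout.

#!/bin/bash

# Sketch: tag distribution of the users who commented on each program.
# Assumes the last tab-separated field of *_userid_times_profile.map holds the
# user's tags, separated by spaces; tagDistribution is a hypothetical directory.
profile_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/userForProgram
result_dir=/home/minelab/liweibo/springNightForLargePaper/second_test/tagDistribution

for profile in `ls $profile_dir`
do
    program=`echo $profile | awk -F'_' '{print $1}'`
    cat $profile_dir/$profile | awk -F'\t' '{print $NF}' | sed 's/ /\n/g' | sort | uniq -c | sort -r -n | sed 's/^ *//g' | awk '{print $2"\t"$1}' > $result_dir/$program.tagdist
    echo $program is done!
done
echo done!

The output format mirrors the topwords files: tag "\t" number of users carrying that tag.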

