Java:以流方式逐行读取 XML——数据挖掘中的大文件预处理。

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;



/**
 * Pre-processes a large XML file into a Weka ARFF file of co-author records.
 *
 * <p>Every file is read line by line, so the input never has to fit in memory. The steps,
 * in the order {@link #main(String[])} runs them:
 * <ol>
 *   <li>{@link #settleXml} extracts the {@code <author>} names, one output line per run of
 *       consecutive author lines (db_pre.arff holds the names read from the XML file);</li>
 *   <li>{@link #elimate_one} drops lines that hold a single name;</li>
 *   <li>{@link #createMap} / {@link #settleMap} count the names and discard the rare ones;</li>
 *   <li>{@link #updateMap} rewrites the records keeping only the surviving names;</li>
 *   <li>{@link #createWekaFile} emits the ARFF file.</li>
 * </ol>
 *
 * <p>I/O errors are reported with {@code printStackTrace()} and are not propagated.
 *
 * @author gjf
 */
public class ElmAuth {

    private static final String AUTHOR_OPEN = "<author>";
    private static final String AUTHOR_CLOSE = "</author>";

    /** Minimum support used by {@link #main(String[])}. */
    private static final int MIN_SUPPORT = 100;

    /** Number of prune/recount rounds run by {@link #main(String[])}. */
    private static final int PRUNE_ROUNDS = 20;

    /** Author name -> number of occurrences, filled by {@link #createMap(String)}. */
    Map<String, Integer> map = new HashMap<String, Integer>();

    /**
     * Step 1: extracts author names from an XML file.
     *
     * <p>A line containing {@code <author>} contributes the text between that tag and the
     * following {@code </author>}; the characters {@code &}, {@code $} and {@code '} in it
     * are replaced by spaces. Consecutive author lines are joined with {@code ","} into one
     * record, and any other line ends the current record. Each record is written as
     * {@code "\n" + record}, so the output starts with an empty line. The largest number of
     * names in a single record is printed to standard output.
     *
     * @param src path of the XML input (e.g. dblp.xml)
     * @param dst path of the output file (e.g. db_pre.arff)
     */
    public void settleXml(String src, String dst) {
        try (BufferedReader br = new BufferedReader(new FileReader(src));
             BufferedWriter bw = new BufferedWriter(new FileWriter(dst))) {
            String line;
            boolean newRecord = true; // true until the first author of a record is written
            int len = 0;              // authors in the current record
            int max = 0;              // largest record seen so far
            while ((line = br.readLine()) != null) {
                int tagStart = line.indexOf(AUTHOR_OPEN);
                if (tagStart == -1) {
                    // A non-author line closes the current record.
                    newRecord = true;
                    if (max < len) {
                        max = len;
                    }
                    len = 0;
                    continue;
                }
                int nameStart = tagStart + AUTHOR_OPEN.length();
                int nameEnd = line.indexOf(AUTHOR_CLOSE, nameStart);
                if (nameEnd == -1) {
                    // No closing tag on this line: keep the rest of the line instead of
                    // failing with StringIndexOutOfBoundsException.
                    nameEnd = line.length();
                }
                String name = line.substring(nameStart, nameEnd)
                        .replace('&', ' ')
                        .replace('$', ' ')
                        .replace("' ", " ")
                        .replace("'", " ");
                bw.write(newRecord ? "\n" : ",");
                bw.write(name);
                len++;
                newRecord = false;
            }
            // The input may end right after an author line; count that record too.
            if (max < len) {
                max = len;
            }
            System.out.println("第一步 论文中具有最大的作者数:" + max);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Step 2: copies only the lines that hold more than one comma-separated item.
     *
     * <p>Each kept line is written followed by {@code "\n"}; empty lines and single-item
     * lines are dropped. The number of kept lines is printed to standard output.
     *
     * @param src path of the input file (e.g. db_pre.arff)
     * @param dst path of the output file (e.g. db_elone.arff)
     */
    public void elimate_one(String src, String dst) {
        try (BufferedReader br = new BufferedReader(new FileReader(src));
             BufferedWriter bw = new BufferedWriter(new FileWriter(dst))) {
            String line;
            int res = 0;
            while ((line = br.readLine()) != null) {
                if (line.split(",").length > 1) {
                    bw.write(line);
                    bw.write("\n");
                    res++;
                }
            }
            System.out.println("这篇论文中去除单个作者后的行数:" + res);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds the occurrence count of every comma-separated name in {@code src} to
     * {@link #map} (key: name, value: number of occurrences). Existing counts are kept and
     * incremented; call {@link #clearMap()} first to start from zero.
     *
     * @param src path of the input file (e.g. db_elone.arff)
     */
    public void createMap(String src) {
        try (BufferedReader br = new BufferedReader(new FileReader(src))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String name : line.split(",")) {
                    if (name.isEmpty()) {
                        continue; // a blank line or empty field is not an author
                    }
                    Integer count = map.get(name);
                    map.put(name, count == null ? 1 : count + 1);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes from {@link #map} every name whose count is below {@code minsup} and prints
     * the number of names that remain.
     *
     * @param minsup minimum count a name needs to be kept
     */
    public void settleMap(int minsup) {
        Iterator<Map.Entry<String, Integer>> it = map.entrySet().iterator();
        while (it.hasNext()) {
            if (it.next().getValue() < minsup) {
                it.remove();
            }
        }
        // Report the threshold actually used rather than a hard-coded 100.
        System.out.println("Map的大小,支持度大于" + minsup + "的作者个数:" + map.size());
    }

    /**
     * Rewrites the records of {@code src}, keeping only the names present in {@link #map}.
     *
     * <p>A line with no surviving name produces no output. Each written record is emitted
     * as {@code "\n" + record}, so the output starts with an empty line. The number of
     * records written is printed to standard output.
     *
     * @param src path of the input file (e.g. db_elone.arff)
     * @param dst path of the output file (e.g. db_minsup.arff)
     */
    public void updateMap(String src, String dst) {
        try (BufferedReader br = new BufferedReader(new FileReader(src));
             BufferedWriter bw = new BufferedWriter(new FileWriter(dst))) {
            String line;
            int res = 0;
            while ((line = br.readLine()) != null) {
                boolean first = true;
                for (String name : line.split(",")) {
                    if (!map.containsKey(name)) {
                        continue;
                    }
                    bw.write(first ? "\n" : ",");
                    bw.write(name);
                    first = false;
                }
                // Count the record as soon as it is complete, so the last one is included.
                if (!first) {
                    res++;
                }
            }
            System.out.println("符合筛选的作者合作写的论文篇数:" + res);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes a Weka ARFF file with one {@code { t}} attribute per name in {@link #map} and
     * one data row per non-empty line of {@code src}.
     *
     * <p>A cell is {@code t} when the name is one of the comma-separated items of the line
     * and {@code ?} otherwise. Single quotes inside a name are written as {@code \'} in the
     * quoted attribute name. Header and rows both follow the iteration order of the map's
     * key set.
     *
     * @param src path of the record file (e.g. db_minsup.arff)
     * @param dst path of the ARFF file to create
     */
    public void createWekaFile(String src, String dst) {
        try (BufferedReader br = new BufferedReader(new FileReader(src));
             BufferedWriter bw = new BufferedWriter(new FileWriter(dst))) {
            bw.write("@relation db" + "\n");
            for (String author : map.keySet()) {
                String escaped = author.replace("'", "\\'");
                bw.write("@attribute '" + escaped + "' { t}\n");
            }
            bw.write("@data" + "\n");

            String line;
            while ((line = br.readLine()) != null) {
                if (line.isEmpty()) {
                    // updateMap starts its output with "\n"; that is not a record.
                    continue;
                }
                // Match whole names, not substrings ("Wei Wang" vs "Li Wei Wang").
                Set<String> present = new HashSet<String>();
                for (String name : line.split(",")) {
                    present.add(name);
                }
                boolean first = true;
                for (String author : map.keySet()) {
                    if (!first) {
                        bw.write(',');
                    }
                    bw.write(present.contains(author) ? 't' : '?');
                    first = false;
                }
                bw.write("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Removes every entry from {@link #map}. */
    public void clearMap() {
        map.clear();
    }

    /**
     * Runs the whole pipeline on {@code dblp.xml} in the working directory and writes
     * {@code db.arff}; intermediate files are created next to it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ElmAuth elmauth = new ElmAuth();
        elmauth.settleXml("dblp.xml", "db_pre.arff");
        elmauth.elimate_one("db_pre.arff", "db_elone.arff");
        elmauth.createMap("db_elone.arff");
        elmauth.settleMap(MIN_SUPPORT); // fixes the minimum support
        elmauth.updateMap("db_elone.arff", "db_minsup.arff");

        // Dropping rare names can leave single-name records, and dropping those records
        // lowers the counts again, so the prune/recount cycle is repeated.
        for (int i = 0; i < PRUNE_ROUNDS; ++i) {
            System.out.println();
            elmauth.elimate_one("db_minsup.arff", "db_minsup_elone.arff");
            elmauth.clearMap();
            elmauth.createMap("db_minsup_elone.arff");
            elmauth.settleMap(MIN_SUPPORT);
            elmauth.updateMap("db_minsup_elone.arff", "db_minsup.arff");
        }

        elmauth.createWekaFile("db_minsup.arff", "db.arff");
    }
}

 

你可能感兴趣的:(java)