asp.net抓取数据(一)

本文简单介绍如何抓取单页的数据

首先,根据传入的 URL 获取网页源码。(注:这里存在编码问题——目前还不完善,无法自动判断网页编码,需要根据目标页面的实际编码手动指定。)

//获取网页源码
        /// <summary>
        /// Requests <paramref name="a_strUrl"/> and returns the response body as text.
        /// </summary>
        /// <param name="a_strUrl">URL of the page to download.</param>
        /// <param name="timeout">Value assigned to <see cref="HttpWebRequest.Timeout"/> (milliseconds).</param>
        /// <returns>
        /// The response body with every line terminated by "\r\n". If any exception is
        /// thrown while requesting or reading, the method does not rethrow; it returns
        /// the text "错误:" followed by the exception message instead.
        /// </returns>
        public static string Get_Http(string a_strUrl, int timeout)
        {
            string strResult;
            try
            {
                HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strUrl);
                myReq.Timeout = timeout;

                // Dispose response, stream and reader deterministically so the
                // underlying connection is released even when reading fails.
                using (HttpWebResponse httpResp = (HttpWebResponse)myReq.GetResponse())
                using (Stream myStream = httpResp.GetResponseStream())
                // The encoding is not detected automatically: Encoding.Default is the
                // platform default encoding. Use Encoding.UTF8 here for UTF-8 pages.
                using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
                {
                    StringBuilder strBuilder = new StringBuilder();

                    // Read until ReadLine reports end of stream. Peek() is not a
                    // reliable end-of-stream test on a non-seekable network stream
                    // and can stop the loop before the whole page has been read.
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        strBuilder.Append(line).Append("\r\n");
                    }
                    strResult = strBuilder.ToString();
                }
            }
            catch (Exception exp)
            {
                // Callers receive the failure as text rather than as an exception.
                strResult = "错误:" + exp.Message;
            }
            return strResult;
        }
然后用正则表达式从网页源码中提取标题、时间和正文,并转换成实体类:

//转换成实体
        /// <summary>
        /// Extracts the title, publication date and body text from an HTML string
        /// and returns them in a new <c>MsgInfo</c>.
        /// </summary>
        /// <param name="strhtml">HTML source to search.</param>
        /// <param name="strbtstart">Pattern immediately preceding the title.</param>
        /// <param name="strbtend">Pattern immediately following the title.</param>
        /// <param name="strsjstart">Pattern immediately preceding the date.</param>
        /// <param name="strsjend">Pattern immediately following the date.</param>
        /// <param name="strnrstart">Pattern immediately preceding the body text.</param>
        /// <param name="strnrend">Pattern immediately following the body text.</param>
        /// <returns>A <c>MsgInfo</c> with <c>title</c>, <c>pubdate</c> and <c>content</c> assigned.</returns>
        /// <remarks>
        /// The start/end markers are inserted into the regular expression verbatim
        /// (they are NOT escaped), so regex metacharacters in them are interpreted as
        /// pattern syntax. Only the first match of each field is used. A field whose
        /// markers are not found yields an empty string; for the date this means
        /// <see cref="Convert.ToDateTime(string)"/> throws <see cref="FormatException"/>.
        /// </remarks>
        public static MsgInfo getinfomation(string strhtml, string strbtstart, string strbtend,string strsjstart,string strsjend,string strnrstart,string strnrend) {
            // Run all three extractions before building the result.
            string title = ExtractBetween(strhtml, strbtstart, strbtend);
            string date = ExtractBetween(strhtml, strsjstart, strsjend);
            string contents = ExtractBetween(strhtml, strnrstart, strnrend);

            MsgInfo msg = new MsgInfo();
            msg.title = title;
            msg.pubdate = Convert.ToDateTime(date);
            msg.content = contents;
            return msg;
        }

        // Returns the shortest non-empty text found between the first occurrence of
        // the start pattern and the following end pattern, or "" when nothing matches.
        private static string ExtractBetween(string html, string start, string end)
        {
            // [\s\S] matches any character including line breaks; the lazy +? stops
            // at the first position where the end pattern matches.
            string pattern = string.Format(@"{0}(?<g>[\s\S]+?){1}", start, end);
            return Regex.Match(html, pattern).Groups["g"].Value;
        }



你可能感兴趣的:(exception,Date,Stream,String,asp.net,encoding)