当前位置:首页>>开发编程>>VS.NET>>新闻内容  
ASP.NET采集系统万能正则表达式
作者: 发布时间:2007-12-11 18:14:25 | 【字体:

由于经常要写采集程序,这里整理了采集中最常用的三个函数,姑且把它们叫作“采集系统万能正则表达式”吧。

第一个:获取页面的 HTML 源码
 public  string GetHtmlSource(string Url, string charset)
        
{
            
if (charset == "" || charset == null) charset = "gb2312";
            
string text1 = "";
            
try
            
{
                HttpWebRequest request1 
= (HttpWebRequest)WebRequest.Create(Url);
                HttpWebResponse response1 
= (HttpWebResponse)request1.GetResponse();
                Stream stream1 
= response1.GetResponseStream();
                StreamReader reader1 
= new StreamReader(stream1, Encoding.GetEncoding(charset));
                text1 
= reader1.ReadToEnd();
                stream1.Close();
                response1.Close();
            }

            
catch (Exception exception1)
            
{
            }

            
return text1;
        }

第二个:截取字符串

/// <summary>
/// Returns the text found between <paramref name="wordsBegin"/> and <paramref name="wordsEnd"/>
/// in <paramref name="code"/>. When the delimiters occur several times, the text of the
/// LAST occurrence is returned.
/// </summary>
/// <param name="code">Text to search, typically downloaded page source.</param>
/// <param name="wordsBegin">
/// Regular-expression fragment that must precede the wanted text. It is inserted into the
/// pattern verbatim, so regex metacharacters in it keep their special meaning.
/// </param>
/// <param name="wordsEnd">Regular-expression fragment that must follow the wanted text (also verbatim).</param>
/// <returns>The captured text (at least one character, matched lazily), or "" when nothing matches.</returns>
public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    // Nothing to search: an empty input can never satisfy the one-or-more capture,
    // and a null input is treated the same way instead of throwing.
    if (string.IsNullOrEmpty(code))
    {
        return "";
    }

    // The pattern is rebuilt on every call, so RegexOptions.Compiled would only add
    // compilation cost without ever being reused.
    Regex pattern = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

    string lastCapture = "";
    for (Match hit = pattern.Match(code); hit.Success; hit = hit.NextMatch())
    {
        lastCapture = hit.Groups["title"].Value;
    }

    return lastCapture;
}

第三个:截取网址

/// <summary>
/// Collects every piece of text found between <paramref name="wordsBegin"/> and
/// <paramref name="wordsEnd"/> in <paramref name="code"/>, in order of appearance.
/// </summary>
/// <param name="code">Text to search, typically downloaded page source.</param>
/// <param name="wordsBegin">
/// Regular-expression fragment that must precede each wanted piece. It is inserted into the
/// pattern verbatim, so regex metacharacters in it keep their special meaning.
/// </param>
/// <param name="wordsEnd">Regular-expression fragment that must follow each wanted piece (also verbatim).</param>
/// <returns>An <see cref="ArrayList"/> of strings; empty when nothing matches.</returns>
public ArrayList SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
{
    ArrayList captures = new ArrayList();

    // Nothing to search: return the empty list rather than throwing on null.
    if (string.IsNullOrEmpty(code))
    {
        return captures;
    }

    // The pattern is rebuilt on every call, so RegexOptions.Compiled would only add
    // compilation cost without ever being reused.
    Regex pattern = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

    for (Match hit = pattern.Match(code); hit.Success; hit = hit.NextMatch())
    {
        captures.Add(hit.Groups["title"].Value);
    }

    return captures;
}

全部代码如下:

using System;
using System.Collections.Generic;
using System.Text;
using System.Data;
using System.Data.OleDb;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
using System.Collections;
using System.Net;
namespace getWeb
{
    public class DBconn
    {
      //   public string dbConnString = @"User ID=sa;Data Source=.;Password=sa;Initial Catalog=GetWeb;Provider=SQLOLEDB.1";
        public string dbConnString = @"provider=microsoft.jet.oledb.4.0;data source=Getweb.mdb";
        public static string GetSource(string Url, string charset)
        {
            if (charset == "" || charset == null) charset = "gb2312";
            string text1 = "";
            try
            {
                Stream stream1 = new WebClient().OpenRead(Url);
                text1 = new StreamReader(stream1, Encoding.GetEncoding(charset)).ReadToEnd();
                stream1.Close();
            }
            catch (Exception exception1)
            {
            }
            return text1;
        }

        public  string GetHtmlSource(string Url, string charset)
        {
            if (charset == "" || charset == null) charset = "gb2312";
            string text1 = "";
            try
            {
                HttpWebRequest request1 = (HttpWebRequest)WebRequest.Create(Url);
                HttpWebResponse response1 = (HttpWebResponse)request1.GetResponse();
                Stream stream1 = response1.GetResponseStream();
                StreamReader reader1 = new StreamReader(stream1, Encoding.GetEncoding(charset));
                text1 = reader1.ReadToEnd();
                stream1.Close();
                response1.Close();
            }
            catch (Exception exception1)
            {
            }
            return text1;
        }

        public string Get_Http(string a_strUrl, int timeout)
        {
            string strResult;

            try
            {
                HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
                myReq.Timeout = timeout;
                HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();

                Stream myStream = HttpWResp.GetResponseStream();

                StreamReader sr = new StreamReader(myStream, Encoding.Default);
                StringBuilder strBuilder = new StringBuilder();
                while (-1 != sr.Peek())
                {
                    strBuilder.Append(sr.ReadLine() + "\r\n");
                }

                strResult = strBuilder.ToString();
            }
            catch (Exception exp)
            {
                strResult = "错误:" + exp.Message;
            }

            return strResult;

        }

        //获取页面内容后,分析页面中连接地址取到要抓取的url:
        //处理页面标题和链接
        public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
        {
            string NewsTitle = "";
            Regex regex1 = new Regex("" + wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd + "", RegexOptions.Compiled | RegexOptions.IgnoreCase);
            for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
            {
                NewsTitle = match1.Groups["title"].ToString();
            }
            return NewsTitle;

        }


        public ArrayList SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
        {
            ArrayList urlList = new ArrayList();
            //string NewsTitle = "";
            Regex regex1 = new Regex("" + wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd + "", RegexOptions.Compiled | RegexOptions.IgnoreCase);
            for (Match match1 = regex1.Match(code); match1.Success; match1 = match1.NextMatch())
            {
                urlList.Add(match1.Groups["title"].ToString());
            }
            return urlList;

        }
    

    }
}


文章来源:
·在ASP.NET中使用AJAX的简单方法
·ASP.NET后台代码实现XmlHttp跨域访问
·ASP.NET取得当前页面的完整URL
·如何在ASP.NET程序里面执行DOS命令
·IIS5IIS6IIS7的ASP.net请求处理过程比较
·ASP.NET备份SQL Server数据库的方法
·让网站图片“另存为”“无标题.bmp”的技巧
·ASP.NET GridView的分页功能
·ASP.NET实现字母+数字方式验证码
 放生
 愚爱
 够爱
 触电
 白狐
 葬爱
 光荣
 画心
 火花
 稻香
 小酒窝
 下雨天
 右手边
 安静了
 魔杰座
 你不像她
 边做边爱
 擦肩而过
 我的答铃
 怀念过去
 等一分钟
 放手去爱
 冰河时代
 你的承诺
 自由飞翔
 原谅我一次
 吻的太逼真
 左眼皮跳跳
 做你的爱人
 一定要爱你
 飞向别人的床
 爱上别人的人
 感动天感动地
 心在跳情在烧
 玫瑰花的葬礼
 有没有人告诉你
 即使知道要见面
 爱上你是一个错
 最后一次的温柔
 爱上你是我的错
 怎么会狠心伤害我
 不是因为寂寞才想
 亲爱的那不是爱情
 难道爱一个人有错
 寂寞的时候说爱我