Monday, December 25, 2006

C# Web Crawler for links

This code downloads a single web page, extracts its hyperlinks (URLs) and their link text with a regular expression, collects them into a collection, and binds that collection to a DataGridView. The code uses Regular Expressions (Regex), HttpWebRequest, WebRequest, WebResponse and more.

View original source code here

using System;
using
System.Collections.Generic;
using
System.ComponentModel;
using
System.Data;
using
System.Drawing;
using
System.Text;
using
System.Windows.Forms;
using
System.Text.RegularExpressions;
using
System.Collections.ObjectModel;
using
System.Net;

namespace
Regexer
{
/// <summary>
/// Main window of the Regexer sample. Clicking the button downloads one web
/// page, extracts every anchor's href value and link text with a regular
/// expression, and lists the results in dataGridView1.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>
    /// Creates the form and builds its controls via InitializeComponent.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Decodes the quot, gt and lt HTML entities in <paramref name="x"/>
    /// back into the literal double-quote, greater-than and less-than
    /// characters.
    /// </summary>
    /// <param name="x">Text that may contain the three entities.</param>
    /// <returns>The text with those entities replaced by their characters.</returns>
    private string Fixx(string x)
    {
        return x.Replace("&quot;", "\"").Replace("&gt;", ">").Replace("&lt;", "<");
    }

    /// <summary>
    /// Downloads the sample page, finds all hyperlinks in it and binds the
    /// resulting list of <see cref="UrlData"/> items to the grid.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // Alternative page to experiment with:
        // ReadStream("http://www.google.com/search?as_q=test+results&hl=en&num=100&btnG=Google+Search", 9000);
        string testData = ReadStream("http://del.icio.us/", 99000);

        // The pattern is stored entity-encoded so that it can live in a
        // verbatim string without doubling the quote characters; Fixx turns
        // it into the real expression:
        //   group 1 = the href value, group 2 = the text between <a ...> and </a>.
        Regex test = new Regex(Fixx(@"&lt;[aA][\s]+[^&gt;]*?[hH][rR][eE][fF][\s]?=[\s\&quot;\']+(.*?)[\&quot;\']+.*?&gt;([^&lt;]+|.*?)?&lt;\/[aA]&gt;"));

        MatchCollection mc = test.Matches(testData);
        Collection<UrlData> uColl = new Collection<UrlData>();

        foreach (Match mt in mc)
        {
            UrlData url = new UrlData();
            url.Url = mt.Groups[1].Value;
            url.Title = mt.Groups[2].Value;
            uColl.Add(url);
        }

        dataGridView1.DataSource = uColl;
        // Column 1 is the second bound column; size it to fit its contents.
        dataGridView1.AutoResizeColumn(1);
    }

    /// <summary>
    /// Requests <paramref name="url"/> and returns the whole response body
    /// as a string.
    /// </summary>
    /// <param name="url">Address of the resource to download.</param>
    /// <param name="timeOut">Value assigned to the request's Timeout property.</param>
    /// <returns>The complete text read from the response stream.</returns>
    public static string ReadStream(string url, int timeOut)
    {
        WebRequest objRequest = WebRequest.Create(url);
        objRequest.Timeout = timeOut;

        // Dispose the response as well as the reader so the underlying
        // connection is released even if reading throws.
        using (WebResponse objResponse = objRequest.GetResponse())
        using (System.IO.StreamReader sr = new System.IO.StreamReader(objResponse.GetResponseStream()))
        {
            return sr.ReadToEnd();
        }
    }
}

/// <summary>
/// Simple data holder for one hyperlink: its address and its title text.
/// Both values are null until assigned.
/// </summary>
public class UrlData
{
    // Backing stores; reference-type fields start out as null.
    private string _url;
    private string _title;

    /// <summary>
    /// Gets or sets the link's address.
    /// </summary>
    public string Url
    {
        get { return this._url; }
        set { this._url = value; }
    }

    /// <summary>
    /// Gets or sets the link's title text.
    /// </summary>
    public string Title
    {
        get { return this._title; }
        set { this._title = value; }
    }
}

}