C# Web Crawler for links
This code fetches a single web page, gathers its URLs (hyperlinks) and their titles into a collection, and binds that collection to a DataGridView. The code uses regular expressions (Regex), HttpWebRequest, WebRequest, WebResponse, and more.
View original source code here
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Text.RegularExpressions;
using System.Collections.ObjectModel;
using System.Net;
namespace Regexer
{
/// <summary>
/// Main form. Clicking the button downloads one web page, extracts its
/// hyperlinks with a regular expression and shows them in <c>dataGridView1</c>.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Address of the page downloaded when the button is clicked.</summary>
    private const string StartUrl = "http://del.icio.us/";

    /// <summary>Timeout, in milliseconds, handed to <see cref="ReadStream"/>.</summary>
    private const int RequestTimeoutMs = 99000;

    /// <summary>
    /// Pattern for an anchor element: group 1 captures the <c>href</c> value,
    /// group 2 captures the text between the opening and closing tags.
    /// </summary>
    /// <remarks>
    /// This is a verbatim string, so a literal double quote is written as two
    /// double quotes. A backslash-quote sequence would terminate the literal.
    /// </remarks>
    private const string LinkPattern =
        @"<[aA][\s]+[^>]*?[hH][rR][eE][fF][\s]?=[\s""']+(.*?)[""']+.*?>([^<]+|.*?)?</[aA]>";

    /// <summary>Parsed once and reused for every click instead of being rebuilt each time.</summary>
    private static readonly Regex LinkRegex = new Regex(LinkPattern);

    /// <summary>Creates the form and its designer-generated controls.</summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Replaces the HTML entities <c>&amp;quot;</c>, <c>&amp;gt;</c> and
    /// <c>&amp;lt;</c> in <paramref name="x"/> with the characters they stand for.
    /// </summary>
    /// <param name="x">Text that may contain the three entities.</param>
    /// <returns>A copy of <paramref name="x"/> with the entities replaced.</returns>
    /// <remarks>
    /// No longer needed for the link pattern, which is now written without
    /// entities. Kept because this class is partial and other parts may call it.
    /// </remarks>
    private string Fixx(string x)
    {
        return x.Replace("&quot;", "\"").Replace("&gt;", ">").Replace("&lt;", "<");
    }

    /// <summary>
    /// Downloads <see cref="StartUrl"/>, collects every link the pattern matches
    /// and binds the resulting collection to the grid.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        string html;
        try
        {
            html = ReadStream(StartUrl, RequestTimeoutMs);
        }
        catch (WebException ex)
        {
            // A failed or timed-out download is an expected condition here;
            // report it rather than letting it escape the event handler.
            MessageBox.Show(this, "Could not download " + StartUrl + ": " + ex.Message);
            return;
        }

        Collection<UrlData> links = new Collection<UrlData>();
        foreach (Match match in LinkRegex.Matches(html))
        {
            UrlData link = new UrlData();
            link.Url = match.Groups[1].Value;
            link.Title = match.Groups[2].Value;
            links.Add(link);
        }

        dataGridView1.DataSource = links;

        // NOTE(review): index 1 is presumably the auto-generated Title column;
        // confirm against the designer's column setup.
        dataGridView1.AutoResizeColumn(1);
    }

    /// <summary>
    /// Requests <paramref name="url"/> and returns the whole response body as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="timeOut">Value assigned to the request's <c>Timeout</c> property.</param>
    /// <returns>The response body read to the end of the stream.</returns>
    /// <exception cref="WebException">The request fails or times out.</exception>
    public static string ReadStream(string url, int timeOut)
    {
        WebRequest request = WebRequest.Create(url);
        request.Timeout = timeOut;

        // Dispose the response as well as the reader so the underlying
        // connection is released even if reading throws.
        using (WebResponse response = request.GetResponse())
        using (System.IO.StreamReader reader =
            new System.IO.StreamReader(response.GetResponseStream()))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Plain data holder pairing a URL with a title. Both values are exposed as
/// read/write string properties and are <c>null</c> until assigned.
/// </summary>
public class UrlData
{
    // Backing storage; reference-type fields start out as null.
    private string _url;
    private string _title;

    /// <summary>Gets or sets the URL text.</summary>
    public string Url
    {
        get
        {
            return _url;
        }
        set
        {
            _url = value;
        }
    }

    /// <summary>Gets or sets the title text.</summary>
    public string Title
    {
        get
        {
            return _title;
        }
        set
        {
            _title = value;
        }
    }
}
}
2 Comments:
Cool stuff.
Another example: http://arachnode.net/content/CrawlerBasics.aspx
C# Code
Post a Comment
<< Home