Published 10/28/2009 by Tim Eisenhauer - Last updated on 11/18/2010 by Raghav Khunger
CODE SNIPPET - C# Parse Meta Tags
You may come across a situation in your C# and ASP.NET programming where you need to download an external webpage and parse its meta tags... specifically, the "Title," "Meta Description," and "Meta Keywords." The method below will show you how to: * download an external webpage * parse the meta title * parse the meta description * parse the meta keywords. The parsing is done using regular expressions. NOTE: This may not be the best way of doing this, but it is a solution that you can use.
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using System.IO;

namespace Tim.Examples.Classes
{
    /// <summary>
    /// Downloads a web page over HTTP(S) and extracts its title, meta keywords and
    /// meta description using regular expressions.
    /// </summary>
    /// <remarks>
    /// The parsing is intentionally simple. A meta tag is only recognised when its
    /// attributes are double-quoted and <c>name</c> comes directly before <c>content</c>,
    /// e.g. <c>&lt;meta name="keywords" content="a,b"&gt;</c>.
    /// </remarks>
    public class WebMetaData
    {
        /// <summary>User-Agent header sent with every download request.</summary>
        private const string UserAgent = "your-search-bot";

        /// <summary>Request timeout in milliseconds (10 seconds).</summary>
        private const int TimeoutMilliseconds = 10 * 1000;

        // The patterns never change, so build them once instead of on every call.
        private static readonly Regex TitleRegex = new Regex(
            "<title\\b[^>]*>([^<]*)</title>",
            RegexOptions.IgnoreCase);

        // Capturing up to the closing quote (rather than up to a literal "\">") also
        // accepts self-closing tags ("... />") and tags with further attributes.
        private static readonly Regex KeywordsRegex = new Regex(
            "<meta\\s+name=\"keywords\"\\s+content=\"([^\"]*)\"",
            RegexOptions.IgnoreCase);

        private static readonly Regex DescriptionRegex = new Regex(
            "<meta\\s+name=\"description\"\\s+content=\"([^\"]*)\"",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Text of the page's <c>&lt;title&gt;</c> element. Null until
        /// <see cref="GetMetaTags"/> succeeds; empty when the page has no title.
        /// </summary>
        public string metaTitle;

        /// <summary>
        /// Content of the "description" meta tag. Null until <see cref="GetMetaTags"/>
        /// succeeds; empty when the tag is not found.
        /// </summary>
        public string metaDescription;

        /// <summary>
        /// Content of the "keywords" meta tag. Null until <see cref="GetMetaTags"/>
        /// succeeds; empty when the tag is not found.
        /// </summary>
        public string metaKeywords;

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and fills
        /// <see cref="metaTitle"/>, <see cref="metaDescription"/> and
        /// <see cref="metaKeywords"/> from its HTML.
        /// </summary>
        /// <param name="url">Absolute http or https address of the page.</param>
        /// <returns>
        /// True when the page was downloaded and parsed; false when the address is
        /// null, malformed or not HTTP(S), or when the download fails. On failure the
        /// three fields are left untouched.
        /// </returns>
        public bool GetMetaTags(string url)
        {
            try
            {
                string html = AcquireHTML(url);
                GetMeta(html);
                return true;
            }
            catch (Exception ex)
            {
                // Best-effort contract: callers receive false instead of an exception.
                // The failure is traced so that it is not lost silently.
                System.Diagnostics.Trace.TraceWarning(
                    "WebMetaData: could not read meta tags from '{0}': {1}",
                    url,
                    ex.Message);
                return false;
            }
        }

        /// <summary>
        /// Downloads the document at <paramref name="address"/> and returns its body as text.
        /// </summary>
        /// <param name="address">Absolute http or https address.</param>
        /// <returns>The response body (may be empty).</returns>
        /// <exception cref="NotSupportedException">
        /// The address does not use the http or https scheme.
        /// </exception>
        /// <remarks>
        /// Other failures (bad URI, network errors, timeouts) surface as the exceptions
        /// thrown by the underlying framework calls; <see cref="GetMetaTags"/> turns
        /// all of them into a false return value.
        /// </remarks>
        private static string AcquireHTML(string address)
        {
            // WebRequest.Create returns other request types for non-HTTP schemes
            // (ftp://, file://), in which case the cast yields null.
            HttpWebRequest request = WebRequest.Create(address) as HttpWebRequest;
            if (request == null)
            {
                throw new NotSupportedException("Only http and https addresses are supported.");
            }

            request.UserAgent = UserAgent;
            request.KeepAlive = false;
            request.Timeout = TimeoutMilliseconds;

            // The using blocks release the connection and the stream on every path,
            // including when reading the body throws.
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts title, keywords and description from <paramref name="strIn"/> and
        /// stores them in the public fields. A tag that is not found yields an empty string.
        /// </summary>
        /// <param name="strIn">HTML source of the page; must not be null.</param>
        private void GetMeta(string strIn)
        {
            // Groups[1].Value is "" for a failed match, so no Success check is needed.
            string title = TitleRegex.Match(strIn).Groups[1].Value;
            string keywords = KeywordsRegex.Match(strIn).Groups[1].Value;
            string description = DescriptionRegex.Match(strIn).Groups[1].Value;

            metaTitle = title;
            metaKeywords = keywords;
            metaDescription = description;
        }
    }
}