Chapter 26 - Regular Expressions
Regular Expression Basics
Simple Quantifier
// "u?" makes the letter u optional (zero or one occurrence), so the
// first two spellings match and the one with a doubled u does not.
string[] spellings = { "color", "colour", "colouur" };
foreach (string spelling in spellings)
{
    Regex.Match(spelling, @"colou?r").Success.Dump();
}
The Match object
// Expression sample: evaluates to the Match object for the first hit.
Regex.Match(input: "any colour you like", pattern: @"colou?r")
Using NextMatch
// Match returns the first hit; NextMatch resumes the search after it.
var colourPattern = new Regex(@"colou?rs?");
Match first = colourPattern.Match("One color? There are two colours in my head!");
Match second = first.NextMatch();

first.Dump("Match 1");
second.Dump("Match 2");
The Matches Method
// Matches returns every hit at once; walk the collection by index.
MatchCollection hits = Regex.Matches("One color? There are two colours in my head!", @"colou?rs?");
for (int i = 0; i < hits.Count; i++)
{
    hits[i].Value.Dump();
}
Alternators
// "Jen", optionally followed by either "ny" or "nifer".
const string namePattern = "Jen(ny|nifer)?";

foreach (string candidate in new[] { "Jenny", "Jennifer", "Jen", "Ben" })
{
    Regex.IsMatch(candidate, namePattern).Dump();
}
Compiled Expressions
// A Regex instance built with RegexOptions.Compiled, reused for several inputs.
var sausagePattern = new Regex(@"sausages?", RegexOptions.Compiled);

foreach (string word in new[] { "sausage", "sausages" })
{
    sausagePattern.Match(word).Success.Dump();
}
RegexOptions
// Case-insensitivity requested through the options argument.
string viaOption = Regex.Match("a", "A", RegexOptions.IgnoreCase).Value;
viaOption.Dump();

// The same switch written inline in the pattern.
string viaInline = Regex.Match("a", @"(?i)A").Value;
viaInline.Dump();

// (?i) turns case-insensitivity on, (?-i) turns it off again mid-pattern.
string toggled = Regex.Match("AAAa", @"(?i)a(?-i)a").Value;
toggled.Dump();
Character Escapes
// The Regex metacharacters are as follows:
//
//   \  *  +  ?  |  {  [  (  )  ^  $  .  #

// A literal question mark must be escaped; unescaped, "?" is a quantifier
// that makes the preceding "t" optional.
Regex.Match("what?", @"what\?").Value.Dump("Correct");
Regex.Match("what?", @"what?").Value.Dump("Incorrect");

// Escape and Unescape convert between literal text and pattern text.
Regex.Escape(@"?").Dump("Escape");
Regex.Unescape(@"\?").Dump("Unescape");

// Spaces in a pattern are literal by default; the (?x) option makes the
// engine ignore them. The two labels are kept distinct so the results can
// be told apart in the output.
Regex.IsMatch("hello world", @"hello world").Dump("Are spaces significant?");
Regex.IsMatch("hello world", @"(?x) hello world").Dump("Are spaces significant with (?x)?");
Character Sets
// [Tt] accepts either case of the first letter.
int thatCount = Regex.Matches("That is that.", "[Tt]hat").Count;
thatCount.Dump("Matches any of a set");

// [^aeiou] accepts any character that is not one of the listed vowels.
int qIndex = Regex.Match("quiz qwerty", "q[^aeiou]").Index;
qIndex.Dump("Matches any except those of a set");

// [a-h] is a character range.
bool isSquareMove = Regex.Match("b1-c4", @"[a-h]\d-[a-h]\d").Success;
isSquareMove.Dump("Matches a range");

// \p{P} matches by Unicode category (punctuation).
bool hasPunctuation = Regex.IsMatch("Yes, please", @"\p{P}");
hasPunctuation.Dump("Matches character category");
Quantifiers
Quantifiers
// * : zero or more of the preceding element.
bool numberedCv = Regex.Match("cv15.docx", @"cv\d*\.docx").Success;
numberedCv.Dump();

bool anyCv = Regex.Match("cvjoint.docx", @"cv.*\.docx").Success;
anyCv.Dump();

// + : one or more of the preceding element.
int slowCount = Regex.Matches("slow! yeah slooow!", "slo+w").Count;
slowCount.Dump();

// {2,3} : between two and three repetitions.
var bloodPressure = new Regex(@"\d{2,3}/\d{2,3}");
foreach (string reading in new[] { "It used to be 160/110", "Now it's only 115/75" })
{
    bloodPressure.Match(reading).Value.Dump();
}
Greedy verus Lazy
string html = "<i>By default</i> quantifiers are <i>greedy</i> creatures";

// Same input, two patterns: ".*" takes as much as it can, ".*?" as little.
var demos = new[]
{
    new { Pattern = @"<i>.*</i>",  Label = "Greedy" },
    new { Pattern = @"<i>.*?</i>", Label = "Lazy" }
};

foreach (var demo in demos)
{
    foreach (Match tag in Regex.Matches(html, demo.Pattern))
    {
        tag.Value.Dump(demo.Label);
    }
}
Zero Width Assertions
Lookahead
// (?=miles) requires "miles" to follow but does not consume it.
const string distance = "say 25 miles more";
Regex.Match(distance, @"\d+\s(?=miles)").Value.Dump();
Regex.Match(distance, @"\d+\s(?=miles).*").Value.Dump();

// A lookahead used as a precondition: a digit somewhere, and 6+ characters.
const string strongPassword = @"(?=.*\d).{6,}";
foreach (string password in new[] { "blahblah3", "blahblaha" })
{
    Regex.IsMatch(password, strongPassword).Dump("Password is strong");
}

// Negative lookahead: "good" not followed later by "however" or "but".
const string unqualifiedPraise = "(?i)good(?!.*(however|but))";
foreach (string remark in new[] { "Good work! But...", "Good work! Thanks!" })
{
    Regex.IsMatch(remark, unqualifiedPraise).Dump("Negative lookahead");
}

// With Multiline, $ matches at each line end; the optional \r in the
// lookahead copes with CRLF line breaks.
string fileNames = string.Join("\r\n", new[] { "a.txt", "b.docx", "c.txt" });
const string textFileLine = @".+\.txt(?=\r?$)";
MatchCollection textFiles = Regex.Matches(fileNames, textFileLine, RegexOptions.Multiline);
foreach (Match file in textFiles)
{
    Console.Write(file + " ");
}
Lookbehind
// Negative lookbehind: "good" must not be preceded by "however".
const string notAfterHowever = "(?i)(?<!however.*)good";

foreach (string sentence in new[] { "However good, we...", "Very good, thanks!" })
{
    Regex.IsMatch(sentence, notAfterHowever).Dump();
}
Anchors
// ^ anchors the match to the start of the input.
string atStart = Regex.Match("Not now", "^[Nn]o").Value;
atStart.Dump();

// $ anchors the match to the end of the input.
string atEnd = Regex.Match("f = 0.2F", "[Ff]$").Value;
atEnd.Dump();
Anchors - Handling End of Lines
// Three CRLF-separated lines; only the .txt ones should be reported.
string fileNames = string.Join("\r\n", new[] { "a.txt", "b.doc", "c.txt" });

// In Multiline mode $ matches just before "\n", so the optional "\r"
// in the lookahead is needed for CRLF line endings.
const string textFileLine = @".+\.txt(?=\r?$)";

MatchCollection textFiles = Regex.Matches(fileNames, textFileLine, RegexOptions.Multiline);
for (int i = 0; i < textFiles.Count; i++)
{
    Console.Write(textFiles[i] + " ");
}
Anchors - Empty Lines
// The sample text needs real line breaks, including one empty line and one
// line holding only spaces, for the two patterns to have anything to find.
// The breaks are written as explicit "\r\n" so they cannot be lost when
// the source is reformatted.
string s = string.Join("\r\n", new[]
{
    "The second to last line",
    "",                 // an empty line
    "has some spaces",
    "   ",              // a line containing only spaces
    "in it!"
});

// Lines with nothing on them at all: one in the sample above.
MatchCollection emptyLines = Regex.Matches(s, "^(?=\r?$)", RegexOptions.Multiline);
emptyLines.Count.Dump();

// Lines that are empty or hold only spaces/tabs: two in the sample above.
MatchCollection blankLines = Regex.Matches(s, "^[ \t]*(?=\r?$)", RegexOptions.Multiline);
blankLines.Count.Dump();
Word Boundaries
const string title = "Wedding in Sarajevo";

// \b\w+\b picks out whole words.
MatchCollection words = Regex.Matches(title, @"\b\w+\b");
for (int i = 0; i < words.Count; i++)
{
    words[i].Value.Dump();
}

// "in" as a whole word versus "in" anywhere, including inside other words.
int wholeWordHits = Regex.Matches(title, @"\bin\b").Count;
wholeWordHits.Dump("With the word boundary operator");

int substringHits = Regex.Matches(title, @"in").Count;
substringHits.Dump("Without the word boundary operator");

// Finds the word that immediately precedes "(sic)".
string text = "Don't loose (sic) your cool";
string flaggedWord = Regex.Match(text, @"\b\w+\b\s(?=\(sic\))").Value;
flaggedWord.Dump();
Groups
Groups
// Group 0 is the whole match; groups 1 and 2 are the parenthesised parts.
Match phone = Regex.Match("206-465-1918", @"(\d{3})-(\d{3}-\d{4})");
for (int groupIndex = 0; groupIndex <= 2; groupIndex++)
{
    phone.Groups[groupIndex].Value.Dump();
}

Console.WriteLine();

// \1 refers back to group 1: words whose last letter repeats the first.
MatchCollection bookended = Regex.Matches("pop pope peep", @"\b(\w)\w+\1\b");
foreach (Match word in bookended)
{
    Console.Write(word + " ");
}
Named Groups
// Words whose last letter equals their first, using a named group.
string sameFirstAndLast = string.Concat(
    @"\b",              // word boundary
    @"(?'letter'\w)",   // first letter, captured as 'letter'
    @"\w+",             // middle letters
    @"\k'letter'",      // last letter must repeat 'letter'
    @"\b");             // word boundary

MatchCollection words = Regex.Matches("bob pope peep", sameFirstAndLast);
for (int i = 0; i < words.Count; i++)
{
    Console.Write(words[i] + " ");
}
Named Groups - XML tag
// Captures an element's tag name and its text content as named groups.
string elementPattern = string.Concat(
    @"<(?'tag'\w+?).*>",   // opening tag, name captured as 'tag'
    @"(?'text'.*?)",       // text content, captured as 'text'
    @"</\k'tag'>");        // closing tag must repeat 'tag'

Match element = Regex.Match("<h1>hello</h1>", elementPattern);

string tagName = element.Groups["tag"].Value;
string content = element.Groups["text"].Value;
tagName.Dump();
content.Dump();
Replacing and Splitting Text
Simple Replacement
// The word boundaries keep the "cat" inside "catapult" from being replaced.
string replaced = Regex.Replace("catapult the cat", @"\bcat\b", "dog");
replaced.Dump();
Referencing the Original String
// $0 in the replacement stands for the whole matched text.
const string sentence = "10 plus 20 makes 30";
var number = new Regex(@"\d+");

string bracketed = number.Replace(sentence, @"<$0>");
bracketed.Dump();
Updating an XML tag
// Rewrites <tag>text</tag> as <tag value="text"/>.
string elementPattern = string.Concat(
    @"<(?'tag'\w+?).*>",   // opening tag, name captured as 'tag'
    @"(?'text'.*?)",       // text content, captured as 'text'
    @"</\k'tag'>");        // closing tag must repeat 'tag'

string attributeForm = string.Concat(
    @"<${tag}",            // <tag
    @" value=""",          //  value="
    @"${text}",            // text
    @"""/>");              // "/>

string rewritten = Regex.Replace("<msg>hello</msg>", elementPattern, attributeForm);
rewritten.Dump();
Using MatchEvaluator
// Expression sample: each number found is replaced by the delegate's result.
Regex.Replace(
    input: "5 is less than 10",
    pattern: @"\d+",
    evaluator: number => (int.Parse(number.Value) * 10).ToString())
Splitting Text
// Split on each digit; the digits themselves are dropped.
string[] letters = Regex.Split("a5b7c", @"\d");
for (int i = 0; i < letters.Length; i++)
{
    Console.Write(letters[i] + " ");
}

Console.WriteLine();

// Split at the zero-width position before each capital, so nothing is lost.
string[] camelParts = Regex.Split("oneTwoThree", @"(?=[A-Z])");
for (int i = 0; i < camelParts.Length; i++)
{
    Console.Write(camelParts[i] + " ");
}
Regex Cookbook
Matching a US Phone or Social Security Number
// Social security number: 3 digits, 2 digits, 4 digits, dash-separated.
const string ssNum = @"\d{3}-\d{2}-\d{4}";
bool ssnOk = Regex.IsMatch("123-45-6789", ssNum);
Console.WriteLine(ssnOk);    // True

// Phone number: area code either "ddd-"/"ddd " or "(ddd)", then 3 + 4 digits.
// (?x) lets the pattern be spaced out for readability.
const string phone = @"(?x) ( \d{3}[-\s] | \(\d{3}\)\s? ) \d{3}[-\s]? \d{4}";
foreach (string number in new[] { "123-456-7890", "(123) 456-7890" })
{
    Console.WriteLine(Regex.IsMatch(number, phone));    // True
}
Extracting Name=Value pairs
// One name=value pair per line. (?m) makes ^ and $ work per line, and the
// optional \r in the lookahead accepts CRLF as well as LF line endings.
// The value group is lazy so that trailing whitespace, including the "\r"
// of a CRLF line break, is left to the following \s* and not captured.
string r = @"(?m)^\s*(?'name'\w+)\s*=\s*(?'value'.*?)\s*(?=\r?$)";

// The line breaks are spelled out explicitly: the pattern works line by
// line, so the input must really contain three lines.
string text = "id = 3" + "\r\n" +
              "secure = true" + "\r\n" +
              "timeout = 30";

foreach (Match m in Regex.Matches(text, r))
{
    Console.WriteLine(m.Groups["name"] + " is " + m.Groups["value"]);
}
Strong Password Validation
string strongPassword = string.Concat(
    @"(?x)",                                // ignore spaces inside the pattern, for readability
    @"^",                                   // anchor at start of string
    @"(?=.* ( \d | \p{P} | \p{S} ))",       // must contain a digit, punctuation char or symbol
    @".{6,}");                              // must be at least 6 characters long

foreach (string candidate in new[] { "abc12", "abcdef", "ab88yz" })
{
    bool isStrong = Regex.IsMatch(candidate, strongPassword);
    Console.WriteLine(isStrong);
}
Lines at least n characters
// Matches every line that is at least 80 characters long.
const string longLine = @"(?m)^.{80,}(?=\r?$)";

string fifty = new string('x', 50);
string eighty = new string('x', 80);

// Three CRLF-separated lines: long, short, long.
string text = string.Join("\r\n", new[] { eighty, fifty, eighty });

int longLineCount = Regex.Matches(text, longLine).Count;
Console.WriteLine(longLineCount);
Parsing Dates and Times
// Three date parts, three time parts and an optional AM/PM marker, each in
// its own group. (?x) ignores the spaces in the pattern; (?i) ignores case.
const string dateTimePattern = @"(?x)(?i) (\d{1,4}) [./-] (\d{1,2}) [./-] (\d{1,4}) [\sT] (\d+):(\d+):(\d+) \s? (A\.?M\.?|P\.?M\.?)?";

Match stamp = Regex.Match("01/02/2008 5:20:50 PM", dateTimePattern);

// Group 0 (the whole match) comes first, then the individual parts.
foreach (Group part in stamp.Groups)
{
    Console.WriteLine(part.Value + " ");
}
Matching Roman Numerals
// Thousands, hundreds, tens and units, in that order. Every one of those
// parts is optional, so on their own they would also succeed with an empty
// match at any word boundary, making IsMatch true for almost any text.
// The lookahead after the opening \b and the lookbehind before the closing
// \b require the match to start and end on a numeral letter, which rules
// the empty match out.
string r =
    @"(?i)\b(?=[mdclxvi])m*" +     // starts on a numeral letter; thousands
    @"(d?c{0,3}|c[dm])" +          // hundreds
    @"(l?x{0,3}|x[lc])" +          // tens
    @"(v?i{0,3}|i[vx])" +          // units
    @"(?<=[mdclxvi])\b";           // ends on a numeral letter

Console.WriteLine(Regex.IsMatch("MCMLXXXIV", r));
Removing Repeated Words
// A word, one separator character, then the same word again. The \b at
// each end makes sure whole words are compared: without them the tail of
// one word can pair with the next ("this is" would collapse to "this"),
// and a word can pair with the start of a longer one ("the theory").
string r = @"\b(?'dupe'\w+)\W\k'dupe'\b";

string text = "In the the beginning...";

// Each doubled word is replaced by a single copy of itself.
Console.WriteLine(Regex.Replace(text, r, "${dupe}"));
Replacing newline with return-newline
// Turns each bare \n into \r\n while leaving existing \r\n pairs untouched:
// the negative lookbehind skips any \n that already follows a \r.
string text = string.Concat("L1", "\n", "L2", "\r\n", "L3");

var bareNewline = new Regex("(?<!\r)\n");
string result = bareNewline.Replace(text, "\r\n");

// Show every character next to its code so the line breaks are visible.
result.Select(ch => new { c = ch, Code = (int) ch }).Dump();
Word Count
// A word is a run of word characters, hyphens and apostrophes.
const string wordPattern = @"\b(\w|[-'])+\b";
const string sentence = "It's all mumbo-jumbo to me";

MatchCollection words = Regex.Matches(sentence, wordPattern);
Console.WriteLine(words.Count);
Matching a GUID
// Five dash-separated groups of hex digits: 8-4-4-4-12.
string guidPattern = string.Concat(
    @"(?i)\b",
    @"[0-9a-fA-F]{8}\-",
    @"[0-9a-fA-F]{4}\-",
    @"[0-9a-fA-F]{4}\-",
    @"[0-9a-fA-F]{4}\-",
    @"[0-9a-fA-F]{12}",
    @"\b");

const string sentence = "Its key is {3F2504E0-4F89-11D3-9A0C-0305E82C3301}.";

// Prints the position at which the GUID starts.
Match guid = Regex.Match(sentence, guidPattern);
Console.WriteLine(guid.Index);
Parsing an XML tag
// Pulls the tag name and the text content out of a simple element.
string elementPattern = string.Concat(
    @"<(?'tag'\w+?).*>",   // opening tag, name captured as 'tag'
    @"(?'text'.*?)",       // text content, captured as 'text'
    @"</\k'tag'>");        // closing tag must repeat 'tag'

Match element = Regex.Match("<h1>hello</h1>", elementPattern);

foreach (string groupName in new[] { "tag", "text" })
{
    Console.WriteLine(element.Groups[groupName].Value);
}
Splitting a Camel-Cased Word
// Split at the zero-width position in front of every capital letter.
string[] parts = Regex.Split("oneTwoThree", @"(?=[A-Z])");

for (int i = 0; i < parts.Length; i++)
{
    Console.Write(parts[i] + " ");
}
Obtaining a Legal Filename
string input = "My \"good\" <recipes>.txt";

// Build a character class from the platform's invalid file-name characters,
// escaping them first so they are taken literally inside the pattern.
string escapedInvalid = Regex.Escape(new string(System.IO.Path.GetInvalidFileNameChars()));
string invalidCharClass = "[" + escapedInvalid + "]";

// Strip every invalid character from the input.
Console.WriteLine(Regex.Replace(input, invalidCharClass, ""));
Escaping Unicode Characters for HTML
// The sample needs a character above U+007F for the escaping to have any
// effect. The copyright sign is written as a \u escape so that it does not
// depend on the encoding the source file is saved or transferred in.
string htmlFragment = "\u00A9 2007";

// Replace every character from U+0080 upwards with a decimal HTML
// character reference, e.g. the copyright sign becomes &#169;.
string result = Regex.Replace(
    htmlFragment,
    @"[\u0080-\uFFFF]",
    m => @"&#" + ((int) m.Value[0]).ToString() + ";");

Console.WriteLine(result);
Unescaping Characters in an HTTP Query String
string sample = "C%23 in a Nutshell";

// Converts one "%hh" escape into the character whose code is the hex value hh.
MatchEvaluator decodeHexPair = escape =>
{
    string hexDigits = escape.Value.Substring(1);
    byte code = Convert.ToByte(hexDigits, 16);
    return ((char) code).ToString();
};

string result = Regex.Replace(sample, @"%[0-9a-f][0-9a-f]", decodeHexPair, RegexOptions.IgnoreCase);
Console.WriteLine(result);
Parsing Google Search Terms from a Web Stats Log
string sample = "http://www.google.com/search?hl=en&q=greedy+quantifiers+regex&btnG=Search";

// The lookbehind locates the "q=" parameter of a Google search URL; the
// lazy match then stops at the next "&" or at the end of the string.
var queryValue = new Regex(@"(?<=google\..+search\?.*q=).+?(?=(&|$))");
string rawTerms = queryValue.Match(sample).Value;

// Terms are "+"-separated in the query string.
char[] separators = { '+' };
string[] keywords = rawTerms.Split(separators, StringSplitOptions.RemoveEmptyEntries);
keywords.Dump();

// Note: this may need to be used in conjunction with the previous
// example, i.e. "Unescaping Characters in an HTTP Query String".