Here is the sample code, written in C#:
using System;
using System.IO;
using System.Collections.Specialized;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace CountWords
{
    /// <summary>
    /// Console tool that counts how often each word occurs in a text file,
    /// prints the unique/total word counts, and writes one CSV row per word.
    /// </summary>
    class Program
    {
        // Paths used when no command-line arguments are supplied.
        private const string DefaultInputPath = "C:\\TEMP\\Kapital.txt";
        private const string DefaultOutputPath = "C:\\TEMP\\OUTPUT.csv";

        // Digits and the listed punctuation marks are blanked out before matching,
        // so a token such as "abc123def" is split into "abc" and "def".
        private static readonly Regex NoiseRegex = new Regex("[.;:,0-9’]");

        // A word is a run of word characters. \w already matches letters such as 'ä',
        // and '|' must not appear in the class: inside [...] it is a literal pipe,
        // which would glue "a|b" into a single word.
        private static readonly Regex WordRegex = new Regex(@"\w+");

        /// <summary>
        /// Reads the input text, counts word frequencies, prints a summary and
        /// writes the per-word counts as quoted CSV.
        /// </summary>
        /// <param name="args">
        /// Optional: args[0] is the input text file, args[1] the output CSV file.
        /// Missing arguments fall back to the default paths.
        /// </param>
        static void Main(string[] args)
        {
            string inputPath = args != null && args.Length > 0 ? args[0] : DefaultInputPath;
            string outputPath = args != null && args.Length > 1 ? args[1] : DefaultOutputPath;

            // UTF-7 is obsolete and treats '+' as the start of a base64 escape, which
            // corrupts plain text containing '+'. A single-byte encoding is used instead.
            // NOTE(review): ISO-8859-1 is an assumption about the input file - confirm
            // its real encoding (use UTF-8 here if the file is UTF-8).
            string fullBook = File.ReadAllText(inputPath, System.Text.Encoding.GetEncoding("ISO-8859-1"));

            int totalWords;
            Dictionary<String, int> frequencies = CountWordFrequencies(fullBook, out totalWords);

            Console.WriteLine("unique words : " + frequencies.Count);
            Console.WriteLine("total words : " + totalWords);

            using (StreamWriter streamWrite = new StreamWriter(outputPath))
            {
                foreach (KeyValuePair<String, int> kv in frequencies)
                {
                    streamWrite.WriteLine("\"{0}\",\"{1}\"", kv.Key, kv.Value);
                }
            }

            // Keep the console window open when run interactively. ReadKey throws
            // when standard input is redirected, so skip it in that case.
            if (!Console.IsInputRedirected)
            {
                Console.ReadKey();
            }
        }

        /// <summary>
        /// Counts how many times each word occurs in <paramref name="text"/>.
        /// Matching is case-sensitive, so "Die" and "die" are separate entries.
        /// </summary>
        /// <param name="text">The text to analyse.</param>
        /// <param name="totalWords">Receives the total number of words matched.</param>
        /// <returns>A dictionary mapping each distinct word to its occurrence count.</returns>
        private static Dictionary<String, int> CountWordFrequencies(string text, out int totalWords)
        {
            string cleaned = NoiseRegex.Replace(text, " ");
            MatchCollection words = WordRegex.Matches(cleaned);

            var frequencies = new Dictionary<String, int>();
            foreach (Match match in words)
            {
                int seen;
                // A single lookup: 'seen' is 0 when the word is not present yet.
                frequencies.TryGetValue(match.Value, out seen);
                frequencies[match.Value] = seen + 1;
            }

            totalWords = words.Count;
            return frequencies;
        }
    }
}
The diagram below shows the result of parsing “Das Kapital” by Karl Marx (the 30 most frequent words).