How to count the frequency of the words in a text

Here is the sample code written in C#:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
using System;
using System.IO;
using System.Collections.Specialized;
using System.Collections.Generic;
using System.Text.RegularExpressions;
 
namespace CountWords
{
    /// <summary>
    /// Console tool that counts how often each word occurs in a text file and
    /// writes the result as a two-column CSV file ("word","count").
    /// </summary>
    class Program
    {
        // Defaults keep the original hard-coded locations, so running the
        // program without arguments behaves as before.
        private const string DefaultInputPath = "C:\\TEMP\\Kapital.txt";
        private const string DefaultOutputPath = "C:\\TEMP\\OUTPUT.csv";

        /// <summary>
        /// Reads the input text, counts the word frequencies, prints the number
        /// of unique and total words, and writes one CSV row per distinct word.
        /// </summary>
        /// <param name="args">
        /// Optional: <c>args[0]</c> is the input text file and <c>args[1]</c>
        /// is the output CSV file. Missing arguments fall back to the defaults.
        /// </param>
        static void Main(string[] args)
        {
            string inputPath = args.Length > 0 ? args[0] : DefaultInputPath;
            string outputPath = args.Length > 1 ? args[1] : DefaultOutputPath;

            // NOTE(review): UTF-7 is kept from the original; it was presumably
            // chosen to get a non-UTF-8 file to load. Confirm the real encoding
            // of the input file and replace this with the matching encoding.
            string fullBook = File.ReadAllText(inputPath, System.Text.Encoding.UTF7);

            int totalWords;
            Dictionary<String, int> dict = CountWordFrequencies(fullBook, out totalWords);

            Console.WriteLine("unique words : " + dict.Count);
            Console.WriteLine("total words : " + totalWords);

            using (StreamWriter streamWrite = new StreamWriter(outputPath))
            {
                foreach (KeyValuePair<String, int> kv in dict)
                {
                    streamWrite.WriteLine("\"{0}\",\"{1}\"", kv.Key, kv.Value);
                }
            }

            // Keep the console window open for interactive runs only;
            // Console.ReadKey throws when standard input is redirected.
            if (!Console.IsInputRedirected)
            {
                Console.ReadKey();
            }
        }

        /// <summary>
        /// Splits <paramref name="text"/> into words and counts the occurrences
        /// of each distinct word. The comparison is case-sensitive.
        /// </summary>
        /// <param name="text">The text to analyse.</param>
        /// <param name="totalWords">Receives the total number of words found.</param>
        /// <returns>A dictionary mapping each distinct word to its count.</returns>
        private static Dictionary<String, int> CountWordFrequencies(string text, out int totalWords)
        {
            //remove numbers and punctuation
            string cleaned = Regex.Replace(text, "\\.|;|:|,|[0-9]|’", " ");

            //create collection of words
            // \w already matches Unicode letters such as 'ä'. The former
            // pattern [\w|ä]+ also accepted a literal '|' as a word character.
            MatchCollection wordCollection = Regex.Matches(cleaned, @"\w+");

            //calculate word frequencies
            var dict = new Dictionary<String, int>();
            foreach (Match match in wordCollection)
            {
                // A single lookup: count stays 0 when the word is not yet known.
                int count;
                dict.TryGetValue(match.Value, out count);
                dict[match.Value] = count + 1;
            }

            totalWords = wordCollection.Count;
            return dict;
        }
    }
}
using System;
using System.IO;
using System.Collections.Specialized;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace CountWords
{
    /// <summary>
    /// Console tool that counts how often each word occurs in a text file and
    /// writes the result as a two-column CSV file ("word","count").
    /// </summary>
    class Program
    {
        // Defaults keep the original hard-coded locations, so running the
        // program without arguments behaves as before.
        private const string DefaultInputPath = "C:\\TEMP\\Kapital.txt";
        private const string DefaultOutputPath = "C:\\TEMP\\OUTPUT.csv";

        /// <summary>
        /// Reads the input text, counts the word frequencies, prints the number
        /// of unique and total words, and writes one CSV row per distinct word.
        /// </summary>
        /// <param name="args">
        /// Optional: <c>args[0]</c> is the input text file and <c>args[1]</c>
        /// is the output CSV file. Missing arguments fall back to the defaults.
        /// </param>
        static void Main(string[] args)
        {
            string inputPath = args.Length > 0 ? args[0] : DefaultInputPath;
            string outputPath = args.Length > 1 ? args[1] : DefaultOutputPath;

            // NOTE(review): UTF-7 is kept from the original; it was presumably
            // chosen to get a non-UTF-8 file to load. Confirm the real encoding
            // of the input file and replace this with the matching encoding.
            string fullBook = File.ReadAllText(inputPath, System.Text.Encoding.UTF7);

            int totalWords;
            Dictionary<String, int> dict = CountWordFrequencies(fullBook, out totalWords);

            Console.WriteLine("unique words : " + dict.Count);
            Console.WriteLine("total words : " + totalWords);

            using (StreamWriter streamWrite = new StreamWriter(outputPath))
            {
                foreach (KeyValuePair<String, int> kv in dict)
                {
                    streamWrite.WriteLine("\"{0}\",\"{1}\"", kv.Key, kv.Value);
                }
            }

            // Keep the console window open for interactive runs only;
            // Console.ReadKey throws when standard input is redirected.
            if (!Console.IsInputRedirected)
            {
                Console.ReadKey();
            }
        }

        /// <summary>
        /// Splits <paramref name="text"/> into words and counts the occurrences
        /// of each distinct word. The comparison is case-sensitive.
        /// </summary>
        /// <param name="text">The text to analyse.</param>
        /// <param name="totalWords">Receives the total number of words found.</param>
        /// <returns>A dictionary mapping each distinct word to its count.</returns>
        private static Dictionary<String, int> CountWordFrequencies(string text, out int totalWords)
        {
            //remove numbers and punctuation
            string cleaned = Regex.Replace(text, "\\.|;|:|,|[0-9]|’", " ");

            //create collection of words
            // \w already matches Unicode letters such as 'ä'. The former
            // pattern [\w|ä]+ also accepted a literal '|' as a word character.
            MatchCollection wordCollection = Regex.Matches(cleaned, @"\w+");

            //calculate word frequencies
            var dict = new Dictionary<String, int>();
            foreach (Match match in wordCollection)
            {
                // A single lookup: count stays 0 when the word is not yet known.
                int count;
                dict.TryGetValue(match.Value, out count);
                dict[match.Value] = count + 1;
            }

            totalWords = wordCollection.Count;
            return dict;
        }
    }
}

The diagram below shows the result of parsing “Das Kapital” by Karl Marx (the 30 most frequent words).
Das Kapital

Leave a comment

Your email address will not be published. Required fields are marked *