Hey guys, I have a text file that I have divided into 4 parts. I want to search each part for the words that appear in it, and score each word by the number of parts it appears in.
Example:
welcome to the national basketball finals,the basketball teams here today have come a long way. without much delay lets play basketball.
I want to return national = 1, as it appears in only one part, and so on for the other words.
I am working on determining text context using word position.
I am working with C# and am not very good at text processing.
Basically:
if a word appears in all 4 sections, it scores 4
if a word appears in 3 sections, it scores 3
if a word appears in 2 sections, it scores 2
if a word appears in 1 section, it scores 1
Thanks in advance.
So far I have this:
// Splits the sample text into numberOfParts word-aligned parts, then scores every
// distinct word by the number of parts it occurs in (1 .. numberOfParts).
// Results:
//   parts           - the text of each part (words separated by single spaces)
//   wordScores      - word -> number of parts that contain it
//   allwords        - every distinct word found in the text
//   wordsInAllParts - the words whose score equals numberOfParts
var s = "welcome to the national basketball finals,the basketball teams here today have come a long way. without much delay lets play basketball. ";
var numberOfParts = 4;
var eachPartLength = s.Length / numberOfParts;
var parts = new List<string>();
// One set of distinct words per part, so membership is an exact whole-word test
// rather than a substring test (e.g. "a" must not match inside "basketball").
var partWords = new List<HashSet<string>>();
// Split on non-word characters and drop the empty tokens. ToList() materializes the
// filtered sequence once; as a deferred query it was re-enumerated on every
// Count()/ElementAt() call inside the loop.
var words = Regex.Split(s, @"\W").Where(w => w.Length > 0).ToList();
var wordsIndex = 0;
for (int i = 0; i < numberOfParts; i++)
{
    var sb = new StringBuilder();
    var wordsInPart = new HashSet<string>();
    // The last part takes every remaining word so that no trailing words are dropped
    // when the integer division above leaves the parts slightly short of the text.
    var isLastPart = i == numberOfParts - 1;
    while (wordsIndex < words.Count && (isLastPart || sb.Length < eachPartLength))
    {
        sb.AppendFormat("{0} ", words[wordsIndex]);
        wordsInPart.Add(words[wordsIndex]);
        wordsIndex++;
    }
    // here you have the part
    // Format the part into the brackets; concatenating "[{0}]" + sb printed the
    // placeholder text literally.
    Response.Write(string.Format("[{0}]", sb));
    parts.Add(sb.ToString());
    partWords.Add(wordsInPart);
}
// Each per-part set holds a word at most once, so counting a word's occurrences
// across all sets gives the number of parts that contain it, i.e. its score.
var wordScores = partWords
    .SelectMany(set => set)
    .GroupBy(w => w)
    .ToDictionary(g => g.Key, g => g.Count());
var allwords = wordScores.Keys.ToList();
var wordsInAllParts = allwords.Where(w => wordScores[w] == numberOfParts).ToList();