N-gram split function for string similarity comparison
- by Michael
As part of excersise to better understand F# which I am currently learning , I wrote function to
split given string into n-grams.
1) I would like to receive feedback about my function : can this be written simpler or in more efficient way?
2) My overall goal is to write function that returns string similarity (on 0.0 .. 1.0 scale) based on n-gram similarity; Does this approach works well for short strings comparisons , or can this method reliably be used to compare large strings (like articles for example).
3) I am aware of the fact that n-gram comparisons ignore context of two strings. What method would you suggest to accomplish my goal?
//s:string - target string to split into n-grams
//n:int - n-gram size to split string into
let ngram_split (s:string, n:int) =
let ngram_count = s.Length - (s.Length % n)
let ngram_list = List.init ngram_count (fun i ->
if( i + n >= s.Length ) then
s.Substring(i,s.Length - i) + String.init ((i + n) - s.Length)
(fun i -> "#")
else
s.Substring(i,n)
)
let ngram_array_unique = ngram_list
|> Seq.ofList
|> Seq.distinct
|> Array.ofSeq
//produce tuples of ngrams (ngram string,how much occurrences in original string)
Seq.init ngram_array_unique.Length (fun i -> (ngram_array_unique.[i],
ngram_list |> List.filter(fun item -> item = ngram_array_unique.[i])
|> List.length)
)