Search Results

Search found 56 results on 3 pages for 'tokenize'.

  • How could I refactor this into more manageable methods?

    - by ChaosPandion
    private static JsonStructure Parse(string jsonText, bool throwException)
    {
        var result = default(JsonStructure);
        var structureStack = new Stack<JsonStructure>();
        var keyStack = new Stack<string>();
        var current = default(JsonStructure);
        var currentState = ParserState.Begin;
        var invalidToken = false;
        var key = default(string);
        var value = default(object);

        foreach (var token in Lexer.Tokenize(jsonText))
        {
            switch (currentState)
            {
                case ParserState.Begin:
                    switch (token.Type)
                    {
                        case TokenType.OpenBrace:
                            currentState = ParserState.ObjectKey;
                            current = result = new JsonObject();
                            break;
                        case TokenType.OpenBracket:
                            currentState = ParserState.ArrayValue;
                            current = result = new JsonArray();
                            break;
                        default:
                            invalidToken = true;
                            break;
                    }
                    break;
                case ParserState.ObjectKey:
                    switch (token.Type)
                    {
                        case TokenType.StringLiteral:
                            currentState = ParserState.ColonSeperator;
                            key = (string)token.Value;
                            break;
                        default:
                            invalidToken = true;
                            break;
                    }
                    break;
                case ParserState.ColonSeperator:
                    switch (token.Type)
                    {
                        case TokenType.Colon:
                            currentState = ParserState.ObjectValue;
                            break;
                        default:
                            invalidToken = true;
                            break;
                    }
                    break;
                case ParserState.ObjectValue:
                case ParserState.ArrayValue:
                    switch (token.Type)
                    {
                        case TokenType.NumberLiteral:
                        case TokenType.StringLiteral:
                        case TokenType.BooleanLiteral:
                        case TokenType.NullLiteral:
                            currentState = ParserState.ItemEnd;
                            value = token.Value;
                            break;
                        case TokenType.OpenBrace:
                            structureStack.Push(current);
                            keyStack.Push(key);
                            currentState = ParserState.ObjectKey;
                            current = new JsonObject();
                            break;
                        case TokenType.OpenBracket:
                            structureStack.Push(current);
                            currentState = ParserState.ArrayValue;
                            current = new JsonArray();
                            break;
                        default:
                            invalidToken = true;
                            break;
                    }
                    break;
                case ParserState.ItemEnd:
                    var jsonObject = (current as JsonObject);
                    if (jsonObject != null)
                    {
                        jsonObject.Add(key, value);
                        currentState = ParserState.ObjectKey;
                    }
                    var jsonArray = (current as JsonArray);
                    if (jsonArray != null)
                    {
                        jsonArray.Add(value);
                        currentState = ParserState.ArrayValue;
                    }
                    switch (token.Type)
                    {
                        case TokenType.CloseBrace:
                        case TokenType.CloseBracket:
                            currentState = ParserState.End;
                            break;
                        case TokenType.Comma:
                            break;
                        default:
                            invalidToken = true;
                            break;
                    }
                    break;
                case ParserState.End:
                    switch (token.Type)
                    {
                        case TokenType.CloseBrace:
                        case TokenType.CloseBracket:
                        case TokenType.Comma:
                            var previous = structureStack.Pop();
                            var previousJsonObject = (previous as JsonObject);
                            if (previousJsonObject != null)
                            {
                                currentState = ParserState.ObjectKey;
                                previousJsonObject.Add(keyStack.Pop(), current);
                            }
                            var previousJsonArray = (previous as JsonArray);
                            if (previousJsonArray != null)
                            {
                                currentState = ParserState.ArrayValue;
                                previousJsonArray.Add(current);
                            }
                            current = previous;
                            if (token.Type != TokenType.Comma)
                            {
                                currentState = ParserState.End;
                            }
                            break;
                        default:
                            invalidToken = true;
                            break;
                    }
                    break;
                default:
                    break;
            }

            if (invalidToken)
            {
                if (throwException)
                {
                    throw new JsonException(token);
                }
                return null;
            }
        }
        return result;
    }
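
    One common refactoring direction, sketched minimally below (shown in C++ rather than the original C#, with invented names; this is the shape of the idea, not ChaosPandion's API): give every ParserState its own small handler that consumes one token and returns the next state. The monolithic nested switch then collapses into a single trivial dispatch, and each former case body becomes a short method that can be read and tested on its own.

    #include <iostream>

    // Hypothetical miniature of the handler-per-state refactoring.
    enum class State { Begin, Value, End };
    enum class TokenType { OpenBracket, NumberLiteral, CloseBracket };

    class Parser
    {
    public:
        // The only switch left: route the token to the current state's handler.
        void consume(TokenType t)
        {
            switch (state_)
            {
                case State::Begin: state_ = onBegin(t); break;
                case State::Value: state_ = onValue(t); break;
                case State::End:   break; // trailing input is ignored here
            }
        }

    private:
        // Each former `case` body becomes a small, independently testable method.
        State onBegin(TokenType t)
        {
            return t == TokenType::OpenBracket ? State::Value : State::End;
        }
        State onValue(TokenType t)
        {
            return t == TokenType::CloseBracket ? State::End : State::Value;
        }

        State state_ = State::Begin;
    };

    int main()
    {
        Parser p;
        const TokenType tokens[] = { TokenType::OpenBracket,
                                     TokenType::NumberLiteral,
                                     TokenType::CloseBracket };
        for (TokenType t : tokens)
            p.consume(t);
        std::cout << "parsed\n";
    }

    In the original, the shared bookkeeping (structureStack, keyStack, invalidToken) would move into fields of such a class, and the near-duplicate JsonObject/JsonArray logic in the ItemEnd and End cases could be factored into one helper that pops and attaches the finished structure.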

  • Fuzzy matching of structured data

    - by user86432
    I have a fairly small corpus of structured records sitting in a database. Given a tiny fraction of the information contained in a single record, submitted via a web form (and so structured in the same way as the table schema; let us call it the test record), I need to quickly draw up a list of the records that are the most likely matches for the test record, as well as provide a confidence estimate of how closely the search terms match a record. The primary purpose of this search is to discover whether someone is attempting to input a record that is a duplicate of one in the corpus. There is a reasonable chance that the test record will be a dupe, and a reasonable chance that it will not be.

    The records are about 12,000 bytes wide and the total count of records is about 150,000. There are 110 columns in the table schema, and 95% of searches will be on the top 5% most commonly searched columns. The data is stuff like names, addresses, telephone numbers, and other industry-specific numbers. In both the corpus and the test record it is entered by hand and is semi-structured within an individual field.

    You might at first blush say "weight the columns by hand and match word tokens within them", but it's not so easy. I thought so too: if I get a telephone number, I thought that would indicate a perfect match. The problem is that there isn't a single field in the form whose token frequency does not vary by orders of magnitude. A telephone number might appear 100 times in the corpus or once in the corpus. The same goes for any other field. This makes weighting at the field level impractical. I need a more fine-grained approach to get decent matching.

    My initial plan was to create a hash of hashes, the top level being the field name. Then I would select all of the information from the corpus for a given field, attempt to clean up the data contained in it, tokenize the sanitized data, and hash the tokens at the second level, with the tokens as keys and frequencies as values. I would use the frequency count as a weight: the higher the frequency of a token in the reference corpus, the less weight I attach to that token if it is found in the test record.

    My first question is for the statisticians in the room: how would I use the frequency as a weight? Is there a precise mathematical relationship between n, the number of records; f(t), the frequency with which a token t appeared in the corpus; the probability o that a record is an original and not a duplicate; and the probability p that the test record is really a record x, given that the test and x contain the same t in the same field? How about the relationship for multiple token matches across multiple fields? Since I sincerely doubt that there is one, is there anything that gets me close but is better than a completely arbitrary hack full of magic factors?

    Barring that, has anyone got a way to do this? I'm especially keen on suggestions that do not involve maintaining another table in the database, such as a token frequency lookup table :). This is my first post on StackOverflow; thanks in advance for any replies you may see fit to give.
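
    There is a standard baseline for exactly this weighting question, from information retrieval rather than from the post itself: inverse document frequency, which formalizes the intuition that rare tokens carry more identifying power than common ones. With n records and a token t occurring in f(t) of them, weight each token by

    \[ w(t) \;=\; \log \frac{n}{f(t)} \]

    and score a candidate record x against the test record by summing over the tokens they share, field by field:

    \[ \mathrm{score}(x) \;=\; \sum_{t \in \mathrm{shared}(x,\, \mathrm{test})} w(t) \]

    A token unique in the corpus then contributes the maximum weight, log n, while a token present in nearly every record contributes almost nothing. Treating shared tokens as independent pieces of evidence in this way is the naive-Bayes reading of the problem: it ignores correlations between fields, but it is the usual principled starting point that beats a hack full of magic factors, and it extends to multiple fields simply because the sum ranges over every field's shared tokens.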

  • Error with Phoenix placeholder _val in Boost.Spirit.Lex :(

    - by GooRoo
    Hello, everybody. I'm a newbie with Boost.Spirit.Lex. A strange error appears every time I try to use lex::_val in a semantic action in my simple lexer:

    #ifndef _TOKENS_H_
    #define _TOKENS_H_

    #include <iostream>
    #include <string>

    #include <boost/spirit/include/lex_lexertl.hpp>
    #include <boost/spirit/include/phoenix_operator.hpp>
    #include <boost/spirit/include/phoenix_statement.hpp>
    #include <boost/spirit/include/phoenix_container.hpp>

    namespace lex = boost::spirit::lex;
    namespace phx = boost::phoenix;

    enum tokenids
    {
        ID_IDENTIFICATOR = 1,
        ID_CONSTANT,
        ID_OPERATION,
        ID_BRACKET,
        ID_WHITESPACES
    };

    template <typename Lexer>
    struct mega_tokens : lex::lexer<Lexer>
    {
        mega_tokens()
            : identifier(L"[a-zA-Z_][a-zA-Z0-9_]*", ID_IDENTIFICATOR)
            , constant  (L"[0-9]+(\\.[0-9]+)?",     ID_CONSTANT)
            , operation (L"[\\+\\-\\*/]",           ID_OPERATION)
            , bracket   (L"[\\(\\)\\[\\]]",         ID_BRACKET)
        {
            using lex::_tokenid;
            using lex::_val;
            using phx::val;

            this->self
                = operation  [ std::wcout << val(L'<') << _tokenid
                            // << val(L':') << lex::_val
                               << val(L'>') ]
                | identifier [ std::wcout << val(L'<') << _tokenid
                               << val(L':') << _val
                               << val(L'>') ]
                | constant   [ std::wcout << val(L'<') << _tokenid
                            // << val(L':') << _val
                               << val(L'>') ]
                | bracket    [ std::wcout << phx::val(lex::_val)
                               << val(L'<') << _tokenid
                            // << val(L':') << lex::_val
                               << val(L'>') ]
                ;
        }

        lex::token_def<wchar_t, wchar_t>      operation;
        lex::token_def<std::wstring, wchar_t> identifier;
        lex::token_def<double, wchar_t>       constant;
        lex::token_def<wchar_t, wchar_t>      bracket;
    };

    #endif // _TOKENS_H_

    and

    #include <cstdlib>
    #include <iostream>
    #include <locale>

    #include <boost/spirit/include/lex_lexertl.hpp>

    #include "tokens.h"

    int main()
    {
        setlocale(LC_ALL, "Russian");

        namespace lex = boost::spirit::lex;

        typedef std::wstring::iterator base_iterator;
        typedef lex::lexertl::token<
            base_iterator,
            boost::mpl::vector<wchar_t, std::wstring, double, wchar_t>,
            boost::mpl::true_
        > token_type;
        typedef lex::lexertl::actor_lexer<token_type> lexer_type;
        typedef mega_tokens<lexer_type>::iterator_type iterator_type;

        mega_tokens<lexer_type> mega_lexer;

        std::wstring str = L"alfa+x1*(2.836-x2[i])";
        base_iterator first = str.begin();

        bool r = lex::tokenize(first, str.end(), mega_lexer);

        if (r)
        {
            std::wcout << L"Success" << std::endl;
        }
        else
        {
            std::wstring rest(first, str.end());
            std::wcerr << L"Lexical analysis failed\n"
                       << L"stopped at: \"" << rest << L"\"\n";
        }

        return EXIT_SUCCESS;
    }

    This code causes an error in the Boost header 'boost/spirit/home/lex/argument.hpp' at line 167 while compiling:

    return: can't convert 'const boost::variant' to 'boost::variant &'

    When I don't use lex::_val, the program compiles with no errors. Obviously I am using _val the wrong way, but I do not know how to do it correctly. Help, please! :)

    P.S. Sorry for my terrible English…
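
    One detail that looks suspicious independent of the compile error: in the bracket action, phx::val(lex::_val) wraps the lazy placeholder itself in an immediate Phoenix value, so the expression is handed the placeholder object rather than the token's value. _val is already lazy and is normally written directly into the actor expression, as the identifier action does. A minimal, untested sketch of that direct style, with invented names and a single string-valued token:

    #include <iostream>
    #include <string>
    #include <boost/spirit/include/lex_lexertl.hpp>
    #include <boost/spirit/include/phoenix_operator.hpp>

    namespace lex = boost::spirit::lex;

    template <typename Lexer>
    struct word_lexer : lex::lexer<Lexer>
    {
        word_lexer()
            : word("[a-z]+")
        {
            // _val is used directly as the lazy token value; no phx::val wrapper.
            this->self = word[std::cout << lex::_val << '\n'];
        }

        lex::token_def<std::string> word;
    };

    int main()
    {
        typedef std::string::iterator iterator;
        typedef lex::lexertl::token<iterator, boost::mpl::vector<std::string> > token_type;
        typedef lex::lexertl::actor_lexer<token_type> lexer_type;

        word_lexer<lexer_type> lexer;
        std::string input("hello");
        iterator first = input.begin();

        bool ok = lex::tokenize(first, input.end(), lexer);
        std::cout << (ok ? "ok" : "failed") << std::endl;
    }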

  • Parsing "true" and "false" using Boost.Spirit.Lex and Boost.Spirit.Qi

    - by Andrew Ross
    As the first stage of a larger grammar using Boost.Spirit, I'm trying to parse "true" and "false" to produce the corresponding bool values, true and false. I'm using Spirit.Lex to tokenize the input and have a working implementation for integer and floating point literals (including those expressed in a relaxed scientific notation), exposing int and float attributes.

    Token definitions

    #include <boost/spirit/include/lex_lexertl.hpp>

    namespace lex = boost::spirit::lex;

    typedef boost::mpl::vector<int, float, bool> token_value_type;

    template <typename Lexer>
    struct basic_literal_tokens : lex::lexer<Lexer>
    {
        basic_literal_tokens()
        {
            this->self.add_pattern("INT", "[-+]?[0-9]+");
            int_literal = "{INT}";
            // To be lexed as a float a numeric literal must have a decimal point
            // or include an exponent, otherwise it will be considered an integer.
            float_literal = "{INT}(((\\.[0-9]+)([eE]{INT})?)|([eE]{INT}))";
            literal_true = "true";
            literal_false = "false";

            this->self = literal_true | literal_false | float_literal | int_literal;
        }

        lex::token_def<int> int_literal;
        lex::token_def<float> float_literal;
        lex::token_def<bool> literal_true, literal_false;
    };

    Testing parsing of float literals

    My real implementation uses Boost.Test, but this is a self-contained example.

    #include <string>
    #include <iostream>
    #include <cmath>
    #include <cstdlib>
    #include <limits>

    bool parse_and_check_float(std::string const & input, float expected)
    {
        typedef std::string::const_iterator base_iterator_type;
        typedef lex::lexertl::token<base_iterator_type, token_value_type > token_type;
        typedef lex::lexertl::lexer<token_type> lexer_type;

        basic_literal_tokens<lexer_type> basic_literal_lexer;

        base_iterator_type input_iter(input.begin());
        float actual;
        bool result = lex::tokenize_and_parse(input_iter, input.end(),
                                              basic_literal_lexer,
                                              basic_literal_lexer.float_literal,
                                              actual);
        return result
            && std::abs(expected - actual) < std::numeric_limits<float>::epsilon();
    }

    int main(int argc, char *argv[])
    {
        if (parse_and_check_float("+31.4e-1", 3.14)) {
            return EXIT_SUCCESS;
        } else {
            return EXIT_FAILURE;
        }
    }

    Parsing "true" and "false"

    My problem is when trying to parse "true" and "false". This is the test code I'm using (after removing the Boost.Test parts):

    bool parse_and_check_bool(std::string const & input, bool expected)
    {
        typedef std::string::const_iterator base_iterator_type;
        typedef lex::lexertl::token<base_iterator_type, token_value_type > token_type;
        typedef lex::lexertl::lexer<token_type> lexer_type;

        basic_literal_tokens<lexer_type> basic_literal_lexer;

        base_iterator_type input_iter(input.begin());
        bool actual;
        lex::token_def<bool> parser = expected ? basic_literal_lexer.literal_true
                                               : basic_literal_lexer.literal_false;
        bool result = lex::tokenize_and_parse(input_iter, input.end(),
                                              basic_literal_lexer, parser, actual);
        return result && actual == expected;
    }

    but compilation fails with:

    boost/spirit/home/qi/detail/assign_to.hpp: In function 'void boost::spirit::traits::assign_to(const Iterator&, const Iterator&, Attribute&) [with Iterator = __gnu_cxx::__normal_iterator<const char*, std::basic_string<char, std::char_traits<char>, std::allocator<char> > >, Attribute = bool]':
    boost/spirit/home/lex/lexer/lexertl/token.hpp:434: instantiated from 'static void boost::spirit::traits::assign_to_attribute_from_value<Attribute, boost::spirit::lex::lexertl::token<Iterator, AttributeTypes, HasState>, void>::call(const boost::spirit::lex::lexertl::token<Iterator, AttributeTypes, HasState>&, Attribute&) [with Attribute = bool, Iterator = __gnu_cxx::__normal_iterator<const char*, std::basic_string<char, std::char_traits<char>, std::allocator<char> > >, AttributeTypes = boost::mpl::vector<int, float, bool, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, HasState = mpl_::bool_<true>]'
    ... backtrace of instantiation points ...
    boost/spirit/home/qi/detail/assign_to.hpp:79: error: no matching function for call to 'boost::spirit::traits::assign_to_attribute_from_iterators<bool, __gnu_cxx::__normal_iterator<const char*, std::basic_string<char, std::char_traits<char>, std::allocator<char> > >, void>::call(const __gnu_cxx::__normal_iterator<const char*, std::basic_string<char, std::char_traits<char>, std::allocator<char> > >&, const __gnu_cxx::__normal_iterator<const char*, std::basic_string<char, std::char_traits<char>, std::allocator<char> > >&, bool&)'
    boost/spirit/home/qi/detail/construct.hpp:64: note: candidates are: static void boost::spirit::traits::assign_to_attribute_from_iterators<bool, Iterator, void>::call(const Iterator&, const Iterator&, char&) [with Iterator = __gnu_cxx::__normal_iterator<const char*, std::basic_string<char, std::char_traits<char>, std::allocator<char> > >]

    My interpretation of this is that Spirit.Qi doesn't know how to convert a string to a bool - surely that's not the case? Has anyone else done this before? If so, how?
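
    For what it's worth, plain Qi does know how to produce a bool from text: the built-in qi::bool_ parser matches "true" and "false" and synthesizes a bool attribute. A minimal self-contained check (standalone Qi, not wired into the lexer above):

    #include <iostream>
    #include <string>
    #include <boost/spirit/include/qi.hpp>

    namespace qi = boost::spirit::qi;

    int main()
    {
        std::string const input("true");
        std::string::const_iterator first = input.begin();
        bool value = false;

        // qi::bool_ matches "true"/"false" and fills a bool attribute.
        bool ok = qi::parse(first, input.end(), qi::bool_, value);

        std::cout << std::boolalpha << "matched: " << ok
                  << ", value: " << value << std::endl;
        return 0;
    }

    In the lex-based pipeline, the failure sits in the traits layer the error names: nothing tells Spirit how to build a bool from the token's raw character range (assign_to_attribute_from_iterators). The candidate fixes are therefore along the lines of specializing that customization point for bool, attaching the value in a token semantic action, or letting the grammar rather than the lexer turn the token's text into a bool; which of these applies cleanly depends on the Boost version, so treat this as a direction rather than a drop-in answer.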

  • Strange (Undefined?) Behavior of Free in C

    - by Chris Cirefice
    This is really strange... and I can't debug it (I tried for about two hours; the debugger starts going haywire after a while). Anyway, I'm trying to do something really simple: free an array of strings. The array is of the form char **myStrings. The array elements are initialized as:

    myString[index] = malloc(strlen(word));
    myString[index] = word;

    and I'm calling a function like this:

    free_memory(myStrings, size);

    where size is the length of the array (I know this is not the problem; I tested it extensively, and everything except this function is working). free_memory looks like this:

    void free_memory(char **list, int size) {
        for (int i = 0; i < size; i ++) {
            free(list[i]);
        }
        free(list);
    }

    Now here comes the weird part: if strlen(list[i]) > size, the program crashes. For example, imagine that I have a list of strings that looks something like this:

    myStrings[0] = "Some";
    myStrings[1] = "random";
    myStrings[2] = "strings";

    And thus the length of this array is 3. If I pass this to my free_memory function, strlen(myStrings[0]) > 3 (4 > 3), and the program crashes. However, if I change myStrings[0] to be "So" instead, then strlen(myStrings[0]) < 3 (2 < 3) and the program does not crash. So it seems to me that free(list[i]) is actually going through the char[] at that location and trying to free each character, which I imagine is undefined behavior. The only reason I say this is that I can play around with the size of the first element of myStrings and make the program crash whenever I feel like it, so I'm assuming this is the problem area.

    Note: I did try to debug this by stepping through the function that calls free_memory, noting any weird values and such, but the moment I step into the free_memory function the debugger crashes, so I'm not really sure what is going on. Nothing is out of the ordinary until I enter the function; then the world explodes.

    Another note: I also posted the shortened version of the source for this program (not too long; Pastebin) here. I am compiling on MinGW with the c99 flag on.

    PS - I just thought of this. I am indeed passing numUniqueWords to the free function, and I know that this does not actually free the entire piece of memory that I allocated. I've called it both ways; that's not the issue. I left it how I did because that is the way I will be calling it once I get it working in the first place; I need to revise some of my logic in that function.

    Source, as per request (on-site):

    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>
    #include <stdlib.h>
    #include "words.h"

    int getNumUniqueWords(char text[], int size);

    int main(int argc, char* argv[])
    {
        setvbuf(stdout, NULL, 4, _IONBF); // For Eclipse... stupid bug. --> does NOT affect the program, just the output to console!
        int nbr_words;
        char text[] = "Some - \"text, a stdin\". We'll have! also repeat? We'll also have a repeat!";
        int length = sizeof(text);
        nbr_words = getNumUniqueWords(text, length);
        return 0;
    }

    void free_memory(char **list, int size)
    {
        for (int i = 0; i < size; i ++) {
            // You can see that printing the values is fine, as long as free is not called.
            // When free is called, the program will crash if (size > strlen(list[i]))
            //printf("Wanna free value %d w/len of %d: %s\n", i, strlen(list[i]), list[i]);
            free(list[i]);
        }
        free(list);
    }

    int getNumUniqueWords(char text[], int length)
    {
        int numTotalWords = 0;
        char *word;

        printf("Length: %d characters\n", length);

        char totalWords[length];
        strcpy(totalWords, text);
        word = strtok(totalWords, " ,.-!?()\"0123456789");
        while (word != NULL) {
            numTotalWords ++;
            printf("%s\n", word);
            word = strtok(NULL, " ,.-!?()\"0123456789");
        }
        printf("Looks like we counted %d total words\n\n", numTotalWords);

        char *uniqueWords[numTotalWords];
        char *tempWord;
        int wordAlreadyExists = 0;
        int numUniqueWords = 0;
        char totalWordsCopy[length];
        strcpy(totalWordsCopy, text);

        for (int i = 0; i < numTotalWords; i++) {
            uniqueWords[i] = NULL;
        }

        // Tokenize until all the text is consumed.
        word = strtok(totalWordsCopy, " ,.-!?()\"0123456789");
        while (word != NULL) {
            // Look through the word list for the current token.
            for (int j = 0; j < numTotalWords; j ++) {
                // Just for clarity, no real meaning.
                tempWord = uniqueWords[j];
                // The word list is either empty or the current token is not in the list.
                if (tempWord == NULL) {
                    break;
                }
                //printf("Comparing (%s) with (%s)\n", tempWord, word);
                // If the current token is the same as the current element in the word list, mark and break
                if (strcmp(tempWord, word) == 0) {
                    printf("\nDuplicate: (%s)\n\n", word);
                    wordAlreadyExists = 1;
                    break;
                }
            }
            // Word does not exist, add it to the array.
            if (!wordAlreadyExists) {
                uniqueWords[numUniqueWords] = malloc(strlen(word));
                uniqueWords[numUniqueWords] = word;
                numUniqueWords ++;
                printf("Unique: %s\n", word);
            }
            // Reset flags and continue.
            wordAlreadyExists = 0;
            word = strtok(NULL, " ,.-!?()\"0123456789");
        }

        // Print out the array just for funsies - make sure it's working properly.
        for (int x = 0; x < numUniqueWords; x++) {
            printf("Unique list %d: %s\n", x, uniqueWords[x]);
        }
        printf("\nNumber of unique words: %d\n\n", numUniqueWords);

        // Right below is where things start to suck.
        free_memory(uniqueWords, numUniqueWords);
        return numUniqueWords;
    }
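
    The crash is consistent with free() receiving pointers it does not own: uniqueWords[numUniqueWords] = malloc(...) followed by uniqueWords[numUniqueWords] = word discards the freshly allocated block and stores a pointer into the stack array that strtok() is walking, so free(list[i]) operates on stack memory. In addition, malloc(strlen(word)) is one byte short of the space needed for the terminating NUL, and a stack array such as uniqueWords must never itself be handed to free(). A hedged, self-contained sketch of the safe allocation pattern (illustrative names, not the poster's full program):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        char text[] = "some random strings";
        char *words[8];
        int count = 0;

        /* Copy each token into a heap block that the array owns.
           malloc(strlen(w) + 1) reserves room for the terminating NUL,
           and strcpy fills the block, so words[count] holds exactly the
           pointer malloc returned - the only thing free() may receive. */
        for (char *w = strtok(text, " "); w != NULL; w = strtok(NULL, " ")) {
            words[count] = (char *)malloc(strlen(w) + 1);
            strcpy(words[count], w);
            count++;
        }

        for (int i = 0; i < count; i++) {
            printf("%s\n", words[i]);
            free(words[i]);   /* frees the heap copy, not a stack pointer */
        }
        /* words itself is a stack array, so it is not free()d. */
        return 0;
    }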
