diff --git a/lexer.py b/lexer.py
index 1b71b904838ba8bff0817cb29e293bfc9c868012..883d8ea4dca8891f2a8560ab565c755eafea1eba 100644
--- a/lexer.py
+++ b/lexer.py
@@ -1,41 +1,38 @@
 import json
 
-def lex(raw):
+# Lexer
+# Converts the input stream into a series of tokens separated by whitespace
+# Returns a map classifying each token by type
+def lex(tokenStream):
     tokens = {}
-    splitted = raw.split()
-
-    types = ['ingredients',
-             'units',
-             'utensils',
-             'prepositions',
-             'articles',
-             'conjunctions',
-             'adjectives']
-
-    with open('database/ingredients.json', 'r') as f:
-        ingredients = json.load(f)
-
-    with open('database/units.json', 'r') as f:
-        units = json.load(f)
-
-    while len(splitted):
-        currentToken = splitted[0]
-
-        if currentToken in ingredients:
-            tokenType = "Ingredient"
-
-        elif currentToken in units:
-            tokenType = "Unit"
-
-        elif currentToken.isdigit():
-            tokenType = "Digit"
-
+    types = ['Ingredient',
+             'Unit',
+             'Utensil',
+             'Preposition',
+             'Article',
+             'Conjunction',
+             'Adjective',
+             'Number',
+             'Amount']
+
+    # read each token-type database as a JSON array of known words
+    jsonFiles = {}
+    for tokenType in types:
+        with open('database/{}.json'.format(tokenType), 'r') as f:
+            jsonFiles[tokenType] = json.load(f)
+
+    for currentToken in tokenStream.split():
+        # bare digits are classified as numbers directly
+        if currentToken.isdigit():
+            tokens[currentToken] = "Number"
+            continue
+        # otherwise, classify by the first database that contains the token
+        for tokenType, words in jsonFiles.items():
+            if currentToken in words:
+                tokens[currentToken] = tokenType
+                break
         else:
-            tokenType = "Unknown"
-
-        tokens[currentToken] = tokenType
-        splitted = splitted[1:]
-    print(tokens)
+            tokens[currentToken] = "Unknown"
+    return tokens
 
-lex("Peel 300 g of potatoes and then cut into strips")
\ No newline at end of file
+print(lex("Slice 3 g of Potato and then cut into strips"))
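Note on the database layout (an assumption; the database files are not part of
this diff): the lexer expects each database/<Type>.json file to contain a flat
JSON array of known words, for example:

    database/Ingredient.json (hypothetical): ["Potato", "Carrot", "Onion"]
    database/Unit.json (hypothetical):       ["g", "kg", "ml"]

With files like these in place, the sample call at the bottom would print
something like:

    {'Slice': 'Unknown', '3': 'Number', 'g': 'Unit', 'Potato': 'Ingredient', ...}

Only the '3' -> 'Number' classification is guaranteed by the isdigit() check;
every other result depends on the contents of the corresponding database file.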