"""Natural language ingredient parser with spaCy NLP.

Uses spaCy for:
- Part-of-speech tagging to distinguish units from foods
- Noun chunk extraction for food names
- Number detection (including word numbers like "one")
- Multi-ingredient sentence splitting
"""

import re
from typing import Optional  # NOTE(review): elided by patch context; required by annotations below

import pint  # NOTE(review): elided by patch context; required by ureg below
from rapidfuzz import fuzz, process

# Unit registry for conversions
ureg = pint.UnitRegistry()

# Try to load spaCy.  Parsing degrades to an error dict when it is missing,
# so a console warning is enough here.
try:
    import spacy

    try:
        # NOTE(review): the load call sits in elided patch context;
        # reconstructed from the model name in the warning below — verify.
        nlp = spacy.load("en_core_web_sm")
        SPACY_AVAILABLE = True
    except OSError:
        nlp = None
        SPACY_AVAILABLE = False
        print(
            "Warning: spaCy model 'en_core_web_sm' not found. "
            "Run: python -m spacy download en_core_web_sm"
        )
except ImportError:
    nlp = None
    SPACY_AVAILABLE = False
    # Fix: in the patch this string was broken by a raw line break inside the
    # quotes (a SyntaxError as committed); rejoined via implicit concatenation.
    print(
        "Warning: spaCy not installed. "
        "Run: pip install spacy"
    )


# Measurement units (recognized by pint or common cooking).
# NOTE(review): multi-word entries ("fluid ounce", "fl oz") can never match
# the per-token is_measurement_unit() check — consider merging adjacent
# tokens or a bigram lookup.
KNOWN_UNITS = {
    # Volume
    "cup",
    "cups",
    "c",
    "tablespoon",
    "tablespoons",
    "tbsp",
    "tbs",
    "tb",
    "teaspoon",
    "teaspoons",
    "tsp",
    "ts",
    "liter",
    "liters",
    "l",
    "litre",
    "litres",
    "milliliter",
    "milliliters",
    "ml",
    "fluid ounce",
    "fluid ounces",
    "fl oz",
    "pint",
    "pints",
    "pt",
    "quart",
    "quarts",
    "qt",
    "gallon",
    "gallons",
    "gal",
    # Weight
    "gram",
    "grams",
    "g",
    "kilogram",
    "kilograms",
    "kg",
    "ounce",
    "ounces",
    "oz",
    "pound",
    "pounds",
    "lb",
    "lbs",
    "milligram",
    "milligrams",
    "mg",
}

# Unit aliases for normalization.
# NOTE(review): only the first and last entries of this mapping are visible
# in the patch hunk context; the elided middle entries from the original
# file MUST be carried over unchanged when applying this review.
UNIT_ALIASES = {
    "tbsp": "tablespoon",
    "tbs": "tablespoon",
    # ... (entries elided by patch context — restore from original file) ...
    "l": "liter",
}

# Density map for volume-to-weight conversion (grams per cup).
# NOTE(review): middle entries elided by patch context; carry them over
# unchanged from the original file.
DENSITY_MAP = {
    "flour": 125,
    "all-purpose flour": 125,
    # ... (entries elided by patch context — restore from original file) ...
    "_default": 150,
}
def normalize_unit(unit: Optional[str]) -> Optional[str]:
    """Normalize unit to standard form via UNIT_ALIASES.

    Returns None for a missing unit; otherwise the lowercased, alias-resolved
    unit name ("tbsp" -> "tablespoon").

    NOTE(review): the middle of this function was elided by the patch
    context; the guard and lowercasing below are reconstructed from the
    visible head and tail — verify against the original file.
    """
    if not unit:
        return None
    unit = unit.lower().strip()
    return UNIT_ALIASES.get(unit, unit)


def is_measurement_unit(word: str) -> bool:
    """Return True when *word* names a known cooking/measurement unit.

    Comparison is case-insensitive against the KNOWN_UNITS vocabulary.
    """
    lowered = word.lower()
    return lowered in KNOWN_UNITS


def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float]:
    """Convert quantity + unit to grams if possible.

    Tries a direct pint mass conversion first; for volume units falls back to
    a per-food density (grams per cup) from DENSITY_MAP.  Returns None when
    no conversion applies (including a missing unit).

    NOTE(review): both conversion bodies were elided by the patch context;
    the code between the visible head/tail lines is a best-effort
    reconstruction — verify against the original file before merging.
    """
    if not unit:
        return None

    try:
        # Direct conversion for mass units (g, kg, oz, lb, ...).
        return (quantity * ureg(unit)).to(ureg.gram).magnitude
    except (pint.UndefinedUnitError, pint.DimensionalityError):
        pass

    # Volume to weight using density
    food_lower = food.lower()
    density = DENSITY_MAP.get(food_lower, DENSITY_MAP["_default"])
    try:
        cups = (quantity * ureg(unit)).to(ureg.cup).magnitude
        return cups * density
    except (pint.UndefinedUnitError, pint.DimensionalityError):
        pass

    return None
# Check if "unit" is actually a count noun (e.g., "3 eggs" → unit="egg", food="s") - # In that case, merge unit back into food - if unit and unit.lower() in COUNT_NOUNS: - food = unit + (" " + food if food else "") - unit = None - - unit = normalize_unit(unit) - grams = get_grams(quantity, unit, food) - - result = { - "quantity": quantity, - "unit": unit, - "food": food.strip(), - } - if grams is not None: - result["grams"] = round(grams, 1) - - return result +def parse_with_spacy(text: str) -> dict: + """Parse ingredient using spaCy NLP. + Uses POS tagging to identify: + - NUM tokens → quantity + - Tokens that are known units → unit + - Remaining noun chunks → food + """ + if not SPACY_AVAILABLE or not nlp: + return {"error": "spaCy not available", "text": text} -def _parse_with_spacy(text: str) -> dict: - """Use spaCy for more complex parsing when regex fails.""" doc = nlp(text) - # Extract numbers quantity = None + unit = None + food_tokens = [] + + # Track which tokens we've consumed + consumed = set() + + # First pass: find numbers (quantity) for token in doc: - if token.like_num: + if token.like_num or token.pos_ == "NUM": try: - quantity = float(token.text) + # Handle word numbers like "one", "two" + if token.text.lower() in { + "one": 1, + "two": 2, + "three": 3, + "four": 4, + "five": 5, + "six": 6, + "seven": 7, + "eight": 8, + "nine": 9, + "ten": 10, + "dozen": 12, + }: + quantity = { + "one": 1, + "two": 2, + "three": 3, + "four": 4, + "five": 5, + "six": 6, + "seven": 7, + "eight": 8, + "nine": 9, + "ten": 10, + "dozen": 12, + }[token.text.lower()] + else: + quantity = float(token.text) + consumed.add(token.i) break except ValueError: pass - # Extract food (noun chunks) - food = None + # Second pass: find unit (must be a known measurement unit) + for token in doc: + if token.i not in consumed and is_measurement_unit(token.text): + unit = token.text.lower() + consumed.add(token.i) + break + + # Third pass: remaining nouns/noun chunks are the food for 
chunk in doc.noun_chunks: - food = chunk.text - break - - if not food: - # Fall back to last noun - for token in reversed(doc): - if token.pos_ == "NOUN": - food = token.text - break + # Skip chunks that only contain consumed tokens + chunk_tokens = [t for t in chunk if t.i not in consumed] + if chunk_tokens: + # Skip determiners like "a", "the" + food_text = " ".join(t.text for t in chunk_tokens if t.pos_ != "DET") + if food_text: + food_tokens.append(food_text) + for t in chunk_tokens: + consumed.add(t.i) + + # If no noun chunks, fall back to individual nouns + if not food_tokens: + for token in doc: + if token.i not in consumed and token.pos_ in ("NOUN", "PROPN"): + food_tokens.append(token.text) + consumed.add(token.i) + + # Also include adjectives that modify food (e.g., "brown sugar") + food_text = " ".join(food_tokens) if food_tokens else None + + if not food_text: + return {"error": "Could not identify food", "text": text} - if not food: - return {"error": "Could not parse ingredient", "text": text} + unit = normalize_unit(unit) + grams = get_grams(quantity or 1.0, unit, food_text) if unit else None - return { + result = { "quantity": quantity or 1.0, - "unit": None, - "food": food, - "parsed_by": "spacy", + "unit": unit, + "food": food_text, } + if grams is not None: + result["grams"] = round(grams, 1) + + return result def parse_ingredient(text: str) -> dict | list[dict]: @@ -269,19 +285,48 @@ def parse_ingredient(text: str) -> dict | list[dict]: Single ingredient: dict with quantity, unit, food, grams Multiple ingredients: list of dicts """ - # Split on separators - parts = SEPARATORS.split(text) + if not SPACY_AVAILABLE: + return { + "error": "spaCy required for parsing. 
Install with: pip install spacy && python -m spacy download en_core_web_sm" + } + + # Use spaCy to split on conjunctions and punctuation intelligently + doc = nlp(text) + + # Find split points (commas, "and" between ingredients) + parts = [] + current_part = [] + + for token in doc: + # Split on comma or semicolon + if token.text in (",", ";"): + if current_part: + parts.append(" ".join(t.text for t in current_part)) + current_part = [] + # Split on "and" when preceded by word (not number) - "flour and sugar" + elif ( + token.text.lower() == "and" + and current_part + and not current_part[-1].like_num + ): + parts.append(" ".join(t.text for t in current_part)) + current_part = [] + else: + current_part.append(token) + + if current_part: + parts.append(" ".join(t.text for t in current_part)) + + # Filter empty parts parts = [p.strip() for p in parts if p.strip()] - if len(parts) == 1: - return parse_single_ingredient(parts[0]) + if len(parts) == 0: + return {"error": "No ingredients found", "text": text} - results = [] - for part in parts: - result = parse_single_ingredient(part) - results.append(result) + if len(parts) == 1: + return parse_with_spacy(parts[0]) - return results + return [parse_with_spacy(part) for part in parts] class FuzzyMatcher: @@ -310,7 +355,6 @@ class FuzzyMatcher: if not self.food_names: return [] - # Use token_set_ratio for better matching of partial/reordered terms matches = process.extract( query, self.food_names, @@ -326,22 +370,6 @@ class FuzzyMatcher: return results def best_match(self, query: str) -> Optional[str]: - """Get single best matching food name. 
- - Args: - query: Parsed food name to match - - Returns: - Best matching name or None if no good match - """ + """Get single best matching food name.""" matches = self.match(query, limit=1) return matches[0]["name"] if matches else None - - -# For backwards compatibility -def parse_ingredient_legacy(text: str) -> dict: - """Legacy single-ingredient parser (original API).""" - result = parse_ingredient(text) - if isinstance(result, list): - return result[0] if result else {"error": "No ingredients found"} - return result