From: Shane Jaroch
Date: Mon, 26 Jan 2026 16:16:16 +0000 (-0500)
Subject: keep working
X-Git-Url: https://git.nutra.tk/v2?a=commitdiff_plain;ds=inline;p=nutratech%2Fsearch-server.git

keep working
---

diff --git a/pylang_serv/parser.py b/pylang_serv/parser.py
index 9dc62a9..2e4473d 100644
--- a/pylang_serv/parser.py
+++ b/pylang_serv/parser.py
@@ -1,10 +1,10 @@
-"""Natural language ingredient parser with spaCy NLP.
+"""Natural language ingredient parser.
 
-Uses spaCy for:
-- Part-of-speech tagging to distinguish units from foods
-- Noun chunk extraction for food names
-- Number detection (including word numbers like "one")
-- Multi-ingredient sentence splitting
+Features:
+- Multi-ingredient parsing ("2 cups flour and 1 tsp salt")
+- Fuzzy food matching to database
+- Robust regex-based parsing (no spaCy dependency required)
+- Optional spaCy integration for enhanced parsing (if available)
 """
 
 import re
@@ -16,24 +16,16 @@ from rapidfuzz import fuzz, process
 
 # Unit registry for conversions
 ureg = pint.UnitRegistry()
 
-# Try to load spaCy
+# Try to load spaCy (optional, may not work on Python 3.14+)
+SPACY_AVAILABLE = False
+nlp = None
 try:
     import spacy
-    try:
-        nlp = spacy.load("en_core_web_sm")
-        SPACY_AVAILABLE = True
-    except OSError:
-        nlp = None
-        SPACY_AVAILABLE = False
-        print(
-            "Warning: spaCy model 'en_core_web_sm' not found. "
-            "Run: python -m spacy download en_core_web_sm"
-        )
-except ImportError:
-    nlp = None
-    SPACY_AVAILABLE = False
-    print("Warning: spaCy not installed. Run: pip install spacy")
+    nlp = spacy.load("en_core_web_sm")
+    SPACY_AVAILABLE = True
+except Exception:
+    pass  # Fall back to regex parsing
 
 
 # Measurement units (recognized by pint or common cooking)
@@ -59,9 +51,6 @@ KNOWN_UNITS = {
     "milliliter",
     "milliliters",
     "ml",
-    "fluid ounce",
-    "fluid ounces",
-    "fl oz",
     "pint",
     "pints",
     "pt",
@@ -107,6 +96,27 @@ UNIT_ALIASES = {
     "l": "liter",
 }
 
+# Word numbers to float
+WORD_NUMBERS = {
+    "one": 1,
+    "two": 2,
+    "three": 3,
+    "four": 4,
+    "five": 5,
+    "six": 6,
+    "seven": 7,
+    "eight": 8,
+    "nine": 9,
+    "ten": 10,
+    "eleven": 11,
+    "twelve": 12,
+    "dozen": 12,
+    "half": 0.5,
+    "quarter": 0.25,
+    "a": 1,
+    "an": 1,
+}
+
 # Density map for volume-to-weight conversion
 DENSITY_MAP = {
     "flour": 125,
@@ -125,6 +135,16 @@ DENSITY_MAP = {
     "_default": 150,
 }
 
+# Regex patterns
+FRACTION_PATTERN = re.compile(r"(\d+)\s*/\s*(\d+)")
+MIXED_NUMBER_PATTERN = re.compile(r"(\d+)\s+(\d+)\s*/\s*(\d+)")
+NUMBER_PATTERN = re.compile(r"\d+\.?\d*")
+
+# Ingredient separators
+SEPARATORS = re.compile(
+    r"\s*(?:,\s*(?:and\s+)?|(?<!\d)\s+and\s+)\s*"
+)
+
 
 def normalize_unit(unit: Optional[str]) -> Optional[str]:
     """Normalize unit to standard form."""
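
As a sanity check, SEPARATORS should split ingredient lists the way the old
spaCy splitter did. A minimal sketch (assuming the pattern above; the results
in the comments are expected values, not captured output):

    import re

    SEPARATORS = re.compile(r"\s*(?:,\s*(?:and\s+)?|(?<!\d)\s+and\s+)\s*")

    # Commas, ", and", and a bare "and" all separate ingredients.
    SEPARATORS.split("1 egg, 2 cups milk, and 3 tbsp butter")
    # -> ['1 egg', '2 cups milk', '3 tbsp butter']

    # "and" directly after a digit is not a separator, so mixed
    # quantities like "1 and 1/2" survive intact.
    SEPARATORS.split("1 and 1/2 cups milk")
    # -> ['1 and 1/2 cups milk']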
@@ -139,6 +159,44 @@ def is_measurement_unit(word: str) -> bool:
     return word.lower() in KNOWN_UNITS
 
 
+def parse_quantity(text: str) -> tuple[Optional[float], str]:
+    """Extract quantity from start of text.
+
+    Returns:
+        Tuple of (quantity, remaining_text)
+    """
+    text = text.strip()
+
+    # Try mixed number first: "1 1/2"
+    match = MIXED_NUMBER_PATTERN.match(text)
+    if match:
+        whole = float(match.group(1))
+        num = float(match.group(2))
+        denom = float(match.group(3))
+        return whole + num / denom, text[match.end() :].strip()
+
+    # Try fraction: "1/2"
+    match = FRACTION_PATTERN.match(text)
+    if match:
+        num = float(match.group(1))
+        denom = float(match.group(2))
+        return num / denom, text[match.end() :].strip()
+
+    # Try regular number
+    match = NUMBER_PATTERN.match(text)
+    if match:
+        return float(match.group()), text[match.end() :].strip()
+
+    # Try word number
+    words = text.split()
+    if words:
+        first_word = words[0].lower()
+        if first_word in WORD_NUMBERS:
+            return WORD_NUMBERS[first_word], " ".join(words[1:])
+
+    return None, text
+
+
 def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float]:
     """Convert quantity + unit to grams if possible."""
     if not unit:
@@ -169,103 +227,37 @@ def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float
     return None
 
 
-def parse_with_spacy(text: str) -> dict:
-    """Parse ingredient using spaCy NLP.
-
-    Uses POS tagging to identify:
-    - NUM tokens → quantity
-    - Tokens that are known units → unit
-    - Remaining noun chunks → food
-    """
-    if not SPACY_AVAILABLE or not nlp:
-        return {"error": "spaCy not available", "text": text}
+def parse_single_ingredient(text: str) -> dict:
+    """Parse a single ingredient string into structured data."""
+    text = text.strip()
+    if not text:
+        return {"error": "Empty input"}
 
-    doc = nlp(text)
+    # Extract quantity
+    quantity, remaining = parse_quantity(text)
 
-    quantity = None
+    # Extract unit (if the next word is a known unit)
     unit = None
-    food_tokens = []
-
-    # Track which tokens we've consumed
-    consumed = set()
-
-    # First pass: find numbers (quantity)
-    for token in doc:
-        if token.like_num or token.pos_ == "NUM":
-            try:
-                # Handle word numbers like "one", "two"
-                if token.text.lower() in {
-                    "one": 1,
-                    "two": 2,
-                    "three": 3,
-                    "four": 4,
-                    "five": 5,
-                    "six": 6,
-                    "seven": 7,
-                    "eight": 8,
-                    "nine": 9,
-                    "ten": 10,
-                    "dozen": 12,
-                }:
-                    quantity = {
-                        "one": 1,
-                        "two": 2,
-                        "three": 3,
-                        "four": 4,
-                        "five": 5,
-                        "six": 6,
-                        "seven": 7,
-                        "eight": 8,
-                        "nine": 9,
-                        "ten": 10,
-                        "dozen": 12,
-                    }[token.text.lower()]
-                else:
-                    quantity = float(token.text)
-                consumed.add(token.i)
-                break
-            except ValueError:
-                pass
-
-    # Second pass: find unit (must be a known measurement unit)
-    for token in doc:
-        if token.i not in consumed and is_measurement_unit(token.text):
-            unit = token.text.lower()
-            consumed.add(token.i)
-            break
-
-    # Third pass: remaining nouns/noun chunks are the food
-    for chunk in doc.noun_chunks:
-        # Skip chunks that only contain consumed tokens
-        chunk_tokens = [t for t in chunk if t.i not in consumed]
-        if chunk_tokens:
-            # Skip determiners like "a", "the"
-            food_text = " ".join(t.text for t in chunk_tokens if t.pos_ != "DET")
-            if food_text:
-                food_tokens.append(food_text)
-            for t in chunk_tokens:
-                consumed.add(t.i)
-
-    # If no noun chunks, fall back to individual nouns
-    if not food_tokens:
-        for token in doc:
-            if token.i not in consumed and token.pos_ in ("NOUN", "PROPN"):
-                food_tokens.append(token.text)
-                consumed.add(token.i)
-
-    # Also include adjectives that modify food (e.g., "brown sugar")
-    food_text = " ".join(food_tokens) if food_tokens else None
-
-    if not food_text:
+    words = remaining.split()
+    if words and is_measurement_unit(words[0]):
+        unit = words[0]
+        remaining = " ".join(words[1:])
+
+    # Remove common filler words
+    remaining = re.sub(r"^(of|the)\s+", "", remaining, flags=re.IGNORECASE)
+
+    food = remaining.strip()
+
+    if not food:
         return {"error": "Could not identify food", "text": text}
 
     unit = normalize_unit(unit)
-    grams = get_grams(quantity or 1.0, unit, food_text) if unit else None
+    grams = get_grams(quantity or 1.0, unit, food) if unit else None
 
     result = {
         "quantity": quantity or 1.0,
         "unit": unit,
-        "food": food_text,
+        "food": food,
     }
     if grams is not None:
         result["grams"] = round(grams, 1)
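
As a sanity check, parse_quantity should behave like this. A minimal sketch
(the import assumes this module is importable as pylang_serv.parser, per the
diff header; results in the comments are expected values, not captured output):

    from pylang_serv.parser import parse_quantity

    parse_quantity("1 1/2 cups flour")  # -> (1.5, 'cups flour')
    parse_quantity("3/4 tsp salt")      # -> (0.75, 'tsp salt')
    parse_quantity("two eggs")          # -> (2, 'eggs')
    parse_quantity("butter")            # -> (None, 'butter')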
@@ -285,48 +277,17 @@ def parse_ingredient(text: str) -> dict | list[dict]:
         Single ingredient: dict with quantity, unit, food, grams
         Multiple ingredients: list of dicts
     """
-    if not SPACY_AVAILABLE:
-        return {
-            "error": "spaCy required for parsing. Install with: pip install spacy && python -m spacy download en_core_web_sm"
-        }
-
-    # Use spaCy to split on conjunctions and punctuation intelligently
-    doc = nlp(text)
-
-    # Find split points (commas, "and" between ingredients)
-    parts = []
-    current_part = []
-
-    for token in doc:
-        # Split on comma or semicolon
-        if token.text in (",", ";"):
-            if current_part:
-                parts.append(" ".join(t.text for t in current_part))
-                current_part = []
-        # Split on "and" when preceded by word (not number) - "flour and sugar"
-        elif (
-            token.text.lower() == "and"
-            and current_part
-            and not current_part[-1].like_num
-        ):
-            parts.append(" ".join(t.text for t in current_part))
-            current_part = []
-        else:
-            current_part.append(token)
-
-    if current_part:
-        parts.append(" ".join(t.text for t in current_part))
-
-    # Filter empty parts
+    # Split on separators
+    parts = SEPARATORS.split(text)
     parts = [p.strip() for p in parts if p.strip()]
 
     if len(parts) == 0:
         return {"error": "No ingredients found", "text": text}
 
     if len(parts) == 1:
-        return parse_with_spacy(parts[0])
+        return parse_single_ingredient(parts[0])
 
-    return [parse_with_spacy(part) for part in parts]
+    return [parse_single_ingredient(part) for part in parts]
 
 
 class FuzzyMatcher:
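
End to end, the regex path should then behave roughly like this (expected
shape, not captured output; exact unit strings and gram values depend on
UNIT_ALIASES, pint, and DENSITY_MAP):

    from pylang_serv.parser import parse_ingredient

    parse_ingredient("an apple")
    # -> {'quantity': 1, 'unit': None, 'food': 'apple'}

    parse_ingredient("2 cups flour and 1 tsp salt")
    # -> [{'quantity': 2.0, 'unit': ..., 'food': 'flour', 'grams': ...},
    #     {'quantity': 1.0, 'unit': ..., 'food': 'salt', 'grams': ...}]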