Nutra Git (v2) - nutratech/search-server.git/commitdiff
wip
author Shane Jaroch <chown_tee@proton.me>
Mon, 26 Jan 2026 16:12:48 +0000 (11:12 -0500)
committer Shane Jaroch <chown_tee@proton.me>
Mon, 26 Jan 2026 16:12:48 +0000 (11:12 -0500)
pylang_serv/parser.py

index d8a93217f5f0bf33e3551ee4dfa1e3cfd7705a3e..9dc62a95b468be6ffe0cc3be695ec44d4c772ef4 100644 (file)
@@ -1,9 +1,10 @@
-"""Natural language ingredient parser with advanced NLP features.
+"""Natural language ingredient parser with spaCy NLP.
 
-Features:
-- Multi-ingredient parsing ("2 cups flour and 1 tsp salt")
-- Fuzzy food matching to database
-- Optional spaCy integration for robust parsing
+Uses spaCy for:
+- Part-of-speech tagging to distinguish units from foods
+- Noun chunk extraction for food names
+- Number detection (including word numbers like "one")
+- Multi-ingredient sentence splitting
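+
+Example (illustrative; exact output depends on the loaded model and density map):
+
+    >>> parse_ingredient("2 cups flour")
+    {'quantity': 2.0, 'unit': 'cup', 'food': 'flour', 'grams': 250.0}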
 """
 
 import re
+import sys
@@ -15,9 +16,8 @@ from rapidfuzz import fuzz, process
 # Unit registry for conversions
 ureg = pint.UnitRegistry()
 
-# Try to load spaCy (optional)
+# Try to load spaCy
 try:
-    # TODO: log warning, give some user feedback in status bar
     import spacy
 
     try:
@@ -26,12 +26,71 @@ try:
     except OSError:
         nlp = None
         SPACY_AVAILABLE = False
+        # Warn on stderr to avoid polluting stdout
+        print(
+            "Warning: spaCy model 'en_core_web_sm' not found. "
+            "Run: python -m spacy download en_core_web_sm",
+            file=sys.stderr,
+        )
 except ImportError:
     nlp = None
     SPACY_AVAILABLE = False
+    print("Warning: spaCy not installed. Run: pip install spacy")
+
+
+# Measurement units (recognized by pint or common in cooking)
+KNOWN_UNITS = {
+    # Volume
+    "cup",
+    "cups",
+    "c",
+    "tablespoon",
+    "tablespoons",
+    "tbsp",
+    "tbs",
+    "tb",
+    "teaspoon",
+    "teaspoons",
+    "tsp",
+    "ts",
+    "liter",
+    "liters",
+    "l",
+    "litre",
+    "litres",
+    "milliliter",
+    "milliliters",
+    "ml",
+    "fluid ounce",
+    "fluid ounces",
+    "fl oz",
+    "pint",
+    "pints",
+    "pt",
+    "quart",
+    "quarts",
+    "qt",
+    "gallon",
+    "gallons",
+    "gal",
+    # Weight
+    "gram",
+    "grams",
+    "g",
+    "kilogram",
+    "kilograms",
+    "kg",
+    "ounce",
+    "ounces",
+    "oz",
+    "pound",
+    "pounds",
+    "lb",
+    "lbs",
+    "milligram",
+    "milligrams",
+    "mg",
+}
+
+# Number words that spaCy tags as NUM but float() cannot parse
+WORD_NUMBERS = {
+    "one": 1,
+    "two": 2,
+    "three": 3,
+    "four": 4,
+    "five": 5,
+    "six": 6,
+    "seven": 7,
+    "eight": 8,
+    "nine": 9,
+    "ten": 10,
+    "dozen": 12,
+}
 
-
-# Common cooking unit aliases
+# Unit aliases for normalization
 UNIT_ALIASES = {
     "tbsp": "tablespoon",
     "tbs": "tablespoon",
@@ -48,7 +107,7 @@ UNIT_ALIASES = {
     "l": "liter",
 }
 
-# Approximate density conversions (grams per cup) for common ingredients
+# Density map for volume-to-weight conversion (approximate grams per cup)
 DENSITY_MAP = {
     "flour": 125,
     "all-purpose flour": 125,
@@ -66,79 +125,6 @@ DENSITY_MAP = {
     "_default": 150,
 }
 
-# Count nouns that look like units but are actually foods
-# These should be treated as food, not unit (e.g., "3 eggs" → food="eggs", not unit="egg")
-COUNT_NOUNS = {
-    "egg",
-    "eggs",
-    "apple",
-    "apples",
-    "banana",
-    "bananas",
-    "orange",
-    "oranges",
-    "clove",
-    "cloves",
-    "slice",
-    "slices",
-    "piece",
-    "pieces",
-    "stick",
-    "sticks",
-    "head",
-    "heads",
-    "bunch",
-    "bunches",
-    "sprig",
-    "sprigs",
-    "leaf",
-    "leaves",
-    "can",
-    "cans",
-    "package",
-    "packages",
-    "bag",
-    "bags",
-    "box",
-    "boxes",
-    "bottle",
-    "bottles",
-    "jar",
-    "jars",
-}
-
-# Ingredient separators for multi-ingredient parsing
-SEPARATORS = re.compile(
-    r"\s*(?:,\s*(?:and\s+)?|(?<!\d)\s+and\s+|\s*;\s*)\s*", re.IGNORECASE
-)
-
-# Pattern to match: [quantity] [unit] [of] [food]
-INGREDIENT_PATTERN = re.compile(
-    r"^\s*"
-    r"(?P<quantity>[\d./]+(?:\s+[\d./]+)?)\s*"  # quantity
-    r"(?P<unit>[a-zA-Z]+)?\s*"  # optional unit
-    r"(?:of\s+)?"  # optional "of"
-    r"(?P<food>.+?)\s*$",
-    re.IGNORECASE,
-)
-
-
-def parse_fraction(s: str) -> float:
-    """Parse a string that might be a fraction like '1/2' or '1 1/2'."""
-    s = s.strip()
-    parts = s.split()
-
-    if len(parts) == 2:
-        whole = float(parts[0])
-        frac = parse_fraction(parts[1])
-        return whole + frac
-
-    if "/" in s:
-        num, denom = s.split("/")
-        return float(num) / float(denom)
-
-    return float(s)
-
 
 def normalize_unit(unit: Optional[str]) -> Optional[str]:
     """Normalize unit to standard form."""
@@ -148,6 +134,11 @@ def normalize_unit(unit: Optional[str]) -> Optional[str]:
     return UNIT_ALIASES.get(unit, unit)
 
 
+def is_measurement_unit(word: str) -> bool:
+    """Check if a word is a known measurement unit."""
+    return word.lower() in KNOWN_UNITS
+
+
 def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float]:
     """Convert quantity + unit to grams if possible."""
     if not unit:
@@ -166,6 +157,7 @@ def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float
     except (pint.UndefinedUnitError, pint.DimensionalityError):
         pass
 
+    # Volume to weight using density
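+    # e.g. 2 cups flour × 125 g/cup = 250 g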
     food_lower = food.lower()
     density = DENSITY_MAP.get(food_lower, DENSITY_MAP["_default"])
 
@@ -177,84 +169,108 @@ def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float
         return None
 
 
-def parse_single_ingredient(text: str) -> dict:
-    """Parse a single ingredient string into structured data."""
-    text = text.strip()
-    if not text:
-        return {"error": "Empty input"}
-
-    match = INGREDIENT_PATTERN.match(text)
-    if not match:
-        # Try spaCy if available for difficult cases
-        if SPACY_AVAILABLE and nlp:
-            return _parse_with_spacy(text)
-        return {"error": "Could not parse ingredient", "text": text}
-
-    quantity_str = match.group("quantity")
-    unit = match.group("unit")
-    food = match.group("food")
-
-    try:
-        quantity = parse_fraction(quantity_str)
-    except (ValueError, ZeroDivisionError):
-        return {"error": f"Invalid quantity: {quantity_str}", "text": text}
-
-    # Check if "unit" is actually a count noun (e.g., "3 eggs" → unit="egg", food="s")
-    # In that case, merge unit back into food
-    if unit and unit.lower() in COUNT_NOUNS:
-        food = unit + (" " + food if food else "")
-        unit = None
-
-    unit = normalize_unit(unit)
-    grams = get_grams(quantity, unit, food)
-
-    result = {
-        "quantity": quantity,
-        "unit": unit,
-        "food": food.strip(),
-    }
-    if grams is not None:
-        result["grams"] = round(grams, 1)
-
-    return result
+def parse_with_spacy(text: str) -> dict:
+    """Parse ingredient using spaCy NLP.
 
+    Uses POS tagging to identify:
+    - NUM tokens → quantity
+    - Tokens that are known units → unit
+    - Remaining noun chunks → food
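+
+    Example (illustrative; tagging varies by spaCy model):
+        >>> parse_with_spacy("one apple")
+        {'quantity': 1.0, 'unit': None, 'food': 'apple'}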
+    """
+    if not SPACY_AVAILABLE or not nlp:
+        return {"error": "spaCy not available", "text": text}
 
-def _parse_with_spacy(text: str) -> dict:
-    """Use spaCy for more complex parsing when regex fails."""
     doc = nlp(text)
 
-    # Extract numbers
     quantity = None
+    unit = None
+    food_tokens = []
+
+    # Track which tokens we've consumed
+    consumed = set()
+
+    # First pass: find numbers (quantity)
     for token in doc:
-        if token.like_num:
+        if token.like_num or token.pos_ == "NUM":
             try:
-                quantity = float(token.text)
+                # Word numbers ("one", "dozen") won't parse as float()
+                word = token.text.lower()
+                if word in WORD_NUMBERS:
+                    quantity = float(WORD_NUMBERS[word])
+                elif "/" in token.text:
+                    # Simple fractions like "1/2" also fail float()
+                    num, denom = token.text.split("/", 1)
+                    quantity = float(num) / float(denom)
+                else:
+                    quantity = float(token.text)
+                consumed.add(token.i)
                 break
-            except ValueError:
+            except (ValueError, ZeroDivisionError):
                 pass
 
-    # Extract food (noun chunks)
-    food = None
+    # Second pass: find unit; try 2-token spans so "fluid ounce"/"fl oz" can match
+    for token in doc:
+        if token.i in consumed:
+            continue
+        two = doc[token.i : token.i + 2].text.lower()
+        if token.i + 1 < len(doc) and two in KNOWN_UNITS:
+            unit = two
+            consumed.update((token.i, token.i + 1))
+            break
+        if is_measurement_unit(token.text):
+            unit = token.text.lower()
+            consumed.add(token.i)
+            break
+
+    # Third pass: remaining nouns/noun chunks are the food
     for chunk in doc.noun_chunks:
-        food = chunk.text
-        break
-
-    if not food:
-        # Fall back to last noun
-        for token in reversed(doc):
-            if token.pos_ == "NOUN":
-                food = token.text
-                break
+        # Skip chunks that only contain consumed tokens
+        chunk_tokens = [t for t in chunk if t.i not in consumed]
+        if chunk_tokens:
+            # Skip determiners like "a", "the"
+            food_text = " ".join(t.text for t in chunk_tokens if t.pos_ != "DET")
+            if food_text:
+                food_tokens.append(food_text)
+                for t in chunk_tokens:
+                    consumed.add(t.i)
+
+    # If no noun chunks, fall back to individual nouns
+    if not food_tokens:
+        for token in doc:
+            if token.i not in consumed and token.pos_ in ("NOUN", "PROPN"):
+                food_tokens.append(token.text)
+                consumed.add(token.i)
+
+    # Noun chunks above already carry adjective modifiers (e.g., "brown sugar")
+    food_text = " ".join(food_tokens) if food_tokens else None
+
+    if not food_text:
+        return {"error": "Could not identify food", "text": text}
 
-    if not food:
-        return {"error": "Could not parse ingredient", "text": text}
+    unit = normalize_unit(unit)
+    grams = get_grams(quantity or 1.0, unit, food_text) if unit else None
 
-    return {
+    result = {
         "quantity": quantity or 1.0,
-        "unit": None,
-        "food": food,
-        "parsed_by": "spacy",
+        "unit": unit,
+        "food": food_text,
     }
+    if grams is not None:
+        result["grams"] = round(grams, 1)
+
+    return result
 
 
 def parse_ingredient(text: str) -> dict | list[dict]:
@@ -269,19 +285,48 @@ def parse_ingredient(text: str) -> dict | list[dict]:
         Single ingredient: dict with quantity, unit, food, grams
         Multiple ingredients: list of dicts
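+
+    Example (illustrative):
+        >>> parse_ingredient("2 cups flour and 1 tsp salt")
+        [{'quantity': 2.0, ...}, {'quantity': 1.0, ...}]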
     """
-    # Split on separators
-    parts = SEPARATORS.split(text)
+    if not SPACY_AVAILABLE:
+        return {
+            "error": "spaCy required for parsing. Install with: pip install spacy && python -m spacy download en_core_web_sm"
+        }
+
+    # Use spaCy to split on conjunctions and punctuation intelligently
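+    #   e.g. "2 cups flour and 1 tsp salt" → ["2 cups flour", "1 tsp salt"]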
+    doc = nlp(text)
+
+    # Find split points (commas, "and" between ingredients)
+    parts = []
+    current_part = []
+
+    for token in doc:
+        # Split on comma or semicolon
+        if token.text in (",", ";"):
+            if current_part:
+                parts.append(" ".join(t.text for t in current_part))
+                current_part = []
+        # Split on "and" when preceded by word (not number) - "flour and sugar"
+        elif (
+            token.text.lower() == "and"
+            and current_part
+            and not current_part[-1].like_num
+        ):
+            parts.append(" ".join(t.text for t in current_part))
+            current_part = []
+        else:
+            current_part.append(token)
+
+    if current_part:
+        parts.append(" ".join(t.text for t in current_part))
+
+    # Filter empty parts
     parts = [p.strip() for p in parts if p.strip()]
 
-    if len(parts) == 1:
-        return parse_single_ingredient(parts[0])
+    if len(parts) == 0:
+        return {"error": "No ingredients found", "text": text}
 
-    results = []
-    for part in parts:
-        result = parse_single_ingredient(part)
-        results.append(result)
+    if len(parts) == 1:
+        return parse_with_spacy(parts[0])
 
-    return results
+    return [parse_with_spacy(part) for part in parts]
 
 
 class FuzzyMatcher:
@@ -310,7 +355,6 @@ class FuzzyMatcher:
         if not self.food_names:
             return []
 
-        # Use token_set_ratio for better matching of partial/reordered terms
         matches = process.extract(
             query,
             self.food_names,
@@ -326,22 +370,6 @@ class FuzzyMatcher:
         return results
 
     def best_match(self, query: str) -> Optional[str]:
-        """Get single best matching food name.
-
-        Args:
-            query: Parsed food name to match
-
-        Returns:
-            Best matching name or None if no good match
-        """
+        """Get single best matching food name."""
         matches = self.match(query, limit=1)
         return matches[0]["name"] if matches else None
-
-
-# For backwards compatibility
-def parse_ingredient_legacy(text: str) -> dict:
-    """Legacy single-ingredient parser (original API)."""
-    result = parse_ingredient(text)
-    if isinstance(result, list):
-        return result[0] if result else {"error": "No ingredients found"}
-    return result