-"""Natural language ingredient parser with advanced NLP features.
+"""Natural language ingredient parser with spaCy NLP.
-Features:
-- Multi-ingredient parsing ("2 cups flour and 1 tsp salt")
-- Fuzzy food matching to database
-- Optional spaCy integration for robust parsing
+Uses spaCy for:
+- Part-of-speech tagging to distinguish units from foods
+- Noun chunk extraction for food names
+- Number detection (including word numbers like "one")
+- Multi-ingredient sentence splitting
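+
+Example (illustrative; exact values depend on the unit and density tables below):
+    parse_ingredient("2 cups flour and 1 tsp salt")
+    # -> [{"quantity": 2.0, "unit": "cup", "food": "flour", "grams": 250.0},
+    #     {"quantity": 1.0, "unit": "teaspoon", "food": "salt"}]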
"""
import re
from typing import Optional

import pint
# Unit registry for conversions
ureg = pint.UnitRegistry()
-# Try to load spaCy (optional)
+# Try to load spaCy
try:
- # TODO: log warning, give some user feedback in status bar
import spacy
    try:
        nlp = spacy.load("en_core_web_sm")
        SPACY_AVAILABLE = True
    except OSError:
nlp = None
SPACY_AVAILABLE = False
+ print(
+ "Warning: spaCy model 'en_core_web_sm' not found. "
+ "Run: python -m spacy download en_core_web_sm"
+ )
except ImportError:
nlp = None
SPACY_AVAILABLE = False
+ print("Warning: spaCy not installed. Run: pip install spacy")
+
+
+# Measurement units (recognized by pint or common in cooking)
+KNOWN_UNITS = {
+ # Volume
+ "cup",
+ "cups",
+ "c",
+ "tablespoon",
+ "tablespoons",
+ "tbsp",
+ "tbs",
+ "tb",
+ "teaspoon",
+ "teaspoons",
+ "tsp",
+ "ts",
+ "liter",
+ "liters",
+ "l",
+ "litre",
+ "litres",
+ "milliliter",
+ "milliliters",
+ "ml",
+ "fluid ounce",
+ "fluid ounces",
+ "fl oz",
+ "pint",
+ "pints",
+ "pt",
+ "quart",
+ "quarts",
+ "qt",
+ "gallon",
+ "gallons",
+ "gal",
+ # Weight
+ "gram",
+ "grams",
+ "g",
+ "kilogram",
+ "kilograms",
+ "kg",
+ "ounce",
+ "ounces",
+ "oz",
+ "pound",
+ "pounds",
+ "lb",
+ "lbs",
+ "milligram",
+ "milligrams",
+ "mg",
+}
-
-# Common cooking unit aliases
+# Unit aliases for normalization
UNIT_ALIASES = {
"tbsp": "tablespoon",
"tbs": "tablespoon",
"l": "liter",
}
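+# Aliases feed normalize_unit() below, e.g. normalize_unit("tbsp") -> "tablespoon";
+# units without an alias pass through unchanged.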
-# Approximate density conversions (grams per cup) for common ingredients
+# Density map (grams per cup) for volume-to-weight conversion
DENSITY_MAP = {
"flour": 125,
"all-purpose flour": 125,
"_default": 150,
}
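+# e.g. 2 cups of flour ≈ 2 * 125 g/cup = 250 g; foods not listed fall back to
+# the "_default" entry (150 g/cup).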
-# Count nouns that look like units but are actually foods
-# These should be treated as food, not unit (e.g., "3 eggs" → food="eggs", not unit="egg")
-COUNT_NOUNS = {
- "egg",
- "eggs",
- "apple",
- "apples",
- "banana",
- "bananas",
- "orange",
- "oranges",
- "clove",
- "cloves",
- "slice",
- "slices",
- "piece",
- "pieces",
- "stick",
- "sticks",
- "head",
- "heads",
- "bunch",
- "bunches",
- "sprig",
- "sprigs",
- "leaf",
- "leaves",
- "can",
- "cans",
- "package",
- "packages",
- "bag",
- "bags",
- "box",
- "boxes",
- "bottle",
- "bottles",
- "jar",
- "jars",
-}
-
-# Ingredient separators for multi-ingredient parsing
-SEPARATORS = re.compile(
- r"\s*(?:,\s*(?:and\s+)?|(?<!\d)\s+and\s+|\s*;\s*)\s*", re.IGNORECASE
-)
-
-# Pattern to match: [quantity] [unit] [of] [food]
-INGREDIENT_PATTERN = re.compile(
- r"^\s*"
- r"(?P<quantity>[\d./]+(?:\s+[\d./]+)?)\s*" # quantity
- r"(?P<unit>[a-zA-Z]+)?\s*" # optional unit
- r"(?:of\s+)?" # optional "of"
- r"(?P<food>.+?)\s*$",
- re.IGNORECASE,
-)
-
-
-def parse_fraction(s: str) -> float:
- """Parse a string that might be a fraction like '1/2' or '1 1/2'."""
- s = s.strip()
- parts = s.split()
-
- if len(parts) == 2:
- whole = float(parts[0])
- frac = parse_fraction(parts[1])
- return whole + frac
-
- if "/" in s:
- num, denom = s.split("/")
- return float(num) / float(denom)
-
- return float(s)
-
def normalize_unit(unit: Optional[str]) -> Optional[str]:
"""Normalize unit to standard form."""
return UNIT_ALIASES.get(unit, unit)
+def is_measurement_unit(word: str) -> bool:
+ """Check if a word is a known measurement unit."""
+ return word.lower() in KNOWN_UNITS
+
+
def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float]:
"""Convert quantity + unit to grams if possible."""
if not unit:
except (pint.UndefinedUnitError, pint.DimensionalityError):
pass
+ # Volume to weight using density
food_lower = food.lower()
density = DENSITY_MAP.get(food_lower, DENSITY_MAP["_default"])
return None
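+
+# Usage sketch (assumes the density fallback above): get_grams(2, "cup", "flour")
+# should come out to roughly 250.0 (2 cups * 125 g/cup from DENSITY_MAP).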
-def parse_single_ingredient(text: str) -> dict:
- """Parse a single ingredient string into structured data."""
- text = text.strip()
- if not text:
- return {"error": "Empty input"}
-
- match = INGREDIENT_PATTERN.match(text)
- if not match:
- # Try spaCy if available for difficult cases
- if SPACY_AVAILABLE and nlp:
- return _parse_with_spacy(text)
- return {"error": "Could not parse ingredient", "text": text}
-
- quantity_str = match.group("quantity")
- unit = match.group("unit")
- food = match.group("food")
-
- try:
- quantity = parse_fraction(quantity_str)
- except (ValueError, ZeroDivisionError):
- return {"error": f"Invalid quantity: {quantity_str}", "text": text}
-
- # Check if "unit" is actually a count noun (e.g., "3 eggs" → unit="egg", food="s")
- # In that case, merge unit back into food
- if unit and unit.lower() in COUNT_NOUNS:
- food = unit + (" " + food if food else "")
- unit = None
-
- unit = normalize_unit(unit)
- grams = get_grams(quantity, unit, food)
-
- result = {
- "quantity": quantity,
- "unit": unit,
- "food": food.strip(),
- }
- if grams is not None:
- result["grams"] = round(grams, 1)
-
- return result
+def parse_with_spacy(text: str) -> dict:
+ """Parse ingredient using spaCy NLP.
+ Uses POS tagging to identify:
+ - NUM tokens → quantity
+ - Tokens that are known units → unit
+ - Remaining noun chunks → food
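+
+    Example (illustrative): "2 cups of flour" ->
+        {"quantity": 2.0, "unit": "cup", "food": "flour", "grams": 250.0}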
+ """
+ if not SPACY_AVAILABLE or not nlp:
+ return {"error": "spaCy not available", "text": text}
-def _parse_with_spacy(text: str) -> dict:
- """Use spaCy for more complex parsing when regex fails."""
doc = nlp(text)
- # Extract numbers
quantity = None
+ unit = None
+ food_tokens = []
+
+ # Track which tokens we've consumed
+ consumed = set()
+
+ # First pass: find numbers (quantity)
for token in doc:
- if token.like_num:
+ if token.like_num or token.pos_ == "NUM":
try:
- quantity = float(token.text)
+                # Handle word numbers like "one", "two"
+                word_to_number = {
+                    "one": 1, "two": 2, "three": 3, "four": 4,
+                    "five": 5, "six": 6, "seven": 7, "eight": 8,
+                    "nine": 9, "ten": 10, "dozen": 12,
+                }
+                word = token.text.lower()
+                if word in word_to_number:
+                    quantity = float(word_to_number[word])
+ else:
+ quantity = float(token.text)
+ consumed.add(token.i)
break
except ValueError:
pass
- # Extract food (noun chunks)
- food = None
+ # Second pass: find unit (must be a known measurement unit)
+ for token in doc:
+ if token.i not in consumed and is_measurement_unit(token.text):
+ unit = token.text.lower()
+ consumed.add(token.i)
+ break
+
+ # Third pass: remaining nouns/noun chunks are the food
for chunk in doc.noun_chunks:
- food = chunk.text
- break
-
- if not food:
- # Fall back to last noun
- for token in reversed(doc):
- if token.pos_ == "NOUN":
- food = token.text
- break
+ # Skip chunks that only contain consumed tokens
+ chunk_tokens = [t for t in chunk if t.i not in consumed]
+ if chunk_tokens:
+ # Skip determiners like "a", "the"
+ food_text = " ".join(t.text for t in chunk_tokens if t.pos_ != "DET")
+ if food_text:
+ food_tokens.append(food_text)
+ for t in chunk_tokens:
+ consumed.add(t.i)
+
+ # If no noun chunks, fall back to individual nouns
+ if not food_tokens:
+ for token in doc:
+ if token.i not in consumed and token.pos_ in ("NOUN", "PROPN"):
+ food_tokens.append(token.text)
+ consumed.add(token.i)
+
+    # Noun chunks already carry modifying adjectives (e.g., "brown sugar");
+    # the noun-only fallback above does not.
+ food_text = " ".join(food_tokens) if food_tokens else None
+
+ if not food_text:
+ return {"error": "Could not identify food", "text": text}
- if not food:
- return {"error": "Could not parse ingredient", "text": text}
+ unit = normalize_unit(unit)
+ grams = get_grams(quantity or 1.0, unit, food_text) if unit else None
- return {
+ result = {
"quantity": quantity or 1.0,
- "unit": None,
- "food": food,
- "parsed_by": "spacy",
+ "unit": unit,
+ "food": food_text,
}
+ if grams is not None:
+ result["grams"] = round(grams, 1)
+
+ return result
def parse_ingredient(text: str) -> dict | list[dict]:
Single ingredient: dict with quantity, unit, food, grams
Multiple ingredients: list of dicts
"""
- # Split on separators
- parts = SEPARATORS.split(text)
+ if not SPACY_AVAILABLE:
+ return {
+ "error": "spaCy required for parsing. Install with: pip install spacy && python -m spacy download en_core_web_sm"
+ }
+
+ # Use spaCy to split on conjunctions and punctuation intelligently
+ doc = nlp(text)
+
+ # Find split points (commas, "and" between ingredients)
+ parts = []
+ current_part = []
+
+ for token in doc:
+ # Split on comma or semicolon
+ if token.text in (",", ";"):
+ if current_part:
+ parts.append(" ".join(t.text for t in current_part))
+ current_part = []
+        # Split on "and" when the previous token is a word, not a number
+        # ("flour and sugar" splits; "one and a half" does not)
+ elif (
+ token.text.lower() == "and"
+ and current_part
+ and not current_part[-1].like_num
+ ):
+ parts.append(" ".join(t.text for t in current_part))
+ current_part = []
+ else:
+ current_part.append(token)
+
+ if current_part:
+ parts.append(" ".join(t.text for t in current_part))
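+    # e.g. "2 cups flour and 1 tsp salt" splits into ["2 cups flour", "1 tsp salt"]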
+
+ # Filter empty parts
parts = [p.strip() for p in parts if p.strip()]
- if len(parts) == 1:
- return parse_single_ingredient(parts[0])
+ if len(parts) == 0:
+ return {"error": "No ingredients found", "text": text}
- results = []
- for part in parts:
- result = parse_single_ingredient(part)
- results.append(result)
+ if len(parts) == 1:
+ return parse_with_spacy(parts[0])
- return results
+ return [parse_with_spacy(part) for part in parts]
class FuzzyMatcher:
if not self.food_names:
return []
- # Use token_set_ratio for better matching of partial/reordered terms
matches = process.extract(
query,
self.food_names,
return results
def best_match(self, query: str) -> Optional[str]:
- """Get single best matching food name.
-
- Args:
- query: Parsed food name to match
-
- Returns:
- Best matching name or None if no good match
- """
+ """Get single best matching food name."""
matches = self.match(query, limit=1)
return matches[0]["name"] if matches else None
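+
+
+# Usage sketch (the constructor call below is an assumption; only match()/best_match()
+# are shown above):
+#   matcher = FuzzyMatcher(["all-purpose flour", "brown sugar"])
+#   matcher.best_match("flour")  # -> "all-purpose flour"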
-
-
-# For backwards compatibility
-def parse_ingredient_legacy(text: str) -> dict:
- """Legacy single-ingredient parser (original API)."""
- result = parse_ingredient(text)
- if isinstance(result, list):
- return result[0] if result else {"error": "No ingredients found"}
- return result