-"""Natural language ingredient parser with spaCy NLP.
+"""Natural language ingredient parser.
-Uses spaCy for:
-- Part-of-speech tagging to distinguish units from foods
-- Noun chunk extraction for food names
-- Number detection (including word numbers like "one")
-- Multi-ingredient sentence splitting
+Features:
+- Multi-ingredient parsing ("2 cups flour and 1 tsp salt")
+- Fuzzy food matching to database
+- Robust regex-based parsing (no spaCy dependency required)
+- Optional spaCy integration for enhanced parsing (if available)
"""
import re
# Unit registry for conversions
ureg = pint.UnitRegistry()
-# Try to load spaCy
+# Try to load spaCy (optional, may not work on Python 3.14+)
+SPACY_AVAILABLE = False
+nlp = None
try:
import spacy
- try:
- nlp = spacy.load("en_core_web_sm")
- SPACY_AVAILABLE = True
- except OSError:
- nlp = None
- SPACY_AVAILABLE = False
- print(
- "Warning: spaCy model 'en_core_web_sm' not found. "
- "Run: python -m spacy download en_core_web_sm"
- )
-except ImportError:
- nlp = None
- SPACY_AVAILABLE = False
- print("Warning: spaCy not installed. Run: pip install spacy")
+ nlp = spacy.load("en_core_web_sm")
+ SPACY_AVAILABLE = True
+except Exception:
+ pass # Fall back to regex parsing
# Measurement units (recognized by pint or common cooking)
"milliliter",
"milliliters",
"ml",
- "fluid ounce",
- "fluid ounces",
- "fl oz",
"pint",
"pints",
"pt",
"l": "liter",
}
+# Word numbers to float
+WORD_NUMBERS = {
+ "one": 1,
+ "two": 2,
+ "three": 3,
+ "four": 4,
+ "five": 5,
+ "six": 6,
+ "seven": 7,
+ "eight": 8,
+ "nine": 9,
+ "ten": 10,
+ "eleven": 11,
+ "twelve": 12,
+ "dozen": 12,
+ "half": 0.5,
+ "quarter": 0.25,
+ "a": 1,
+ "an": 1,
+}
+
# Density map for volume-to-weight conversion
DENSITY_MAP = {
"flour": 125,
"_default": 150,
}
+# Regex patterns
+FRACTION_PATTERN = re.compile(r"(\d+)\s*/\s*(\d+)")
+MIXED_NUMBER_PATTERN = re.compile(r"(\d+)\s+(\d+)\s*/\s*(\d+)")
+NUMBER_PATTERN = re.compile(r"\d+\.?\d*")
+
+# Ingredient separators
+SEPARATORS = re.compile(
+ r"\s*(?:,\s*(?:and\s+)?|(?<!\d)\s+and\s+|\s*;\s*)\s*", re.IGNORECASE
+)
+
def normalize_unit(unit: Optional[str]) -> Optional[str]:
"""Normalize unit to standard form."""
return word.lower() in KNOWN_UNITS
+def parse_quantity(text: str) -> tuple[Optional[float], str]:
+ """Extract quantity from start of text.
+
+ Returns:
+ Tuple of (quantity, remaining_text)
+ """
+ text = text.strip()
+
+ # Try mixed number first: "1 1/2"
+ match = MIXED_NUMBER_PATTERN.match(text)
+ if match:
+ whole = float(match.group(1))
+ num = float(match.group(2))
+ denom = float(match.group(3))
+ return whole + num / denom, text[match.end() :].strip()
+
+ # Try fraction: "1/2"
+ match = FRACTION_PATTERN.match(text)
+ if match:
+ num = float(match.group(1))
+ denom = float(match.group(2))
+ return num / denom, text[match.end() :].strip()
+
+ # Try regular number
+ match = NUMBER_PATTERN.match(text)
+ if match:
+ return float(match.group()), text[match.end() :].strip()
+
+ # Try word number
+ words = text.split()
+ if words:
+ first_word = words[0].lower()
+ if first_word in WORD_NUMBERS:
+ return WORD_NUMBERS[first_word], " ".join(words[1:])
+
+ return None, text
+
+
def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float]:
"""Convert quantity + unit to grams if possible."""
if not unit:
return None
-def parse_with_spacy(text: str) -> dict:
- """Parse ingredient using spaCy NLP.
-
- Uses POS tagging to identify:
- - NUM tokens → quantity
- - Tokens that are known units → unit
- - Remaining noun chunks → food
- """
- if not SPACY_AVAILABLE or not nlp:
- return {"error": "spaCy not available", "text": text}
+def parse_single_ingredient(text: str) -> dict:
+ """Parse a single ingredient string into structured data."""
+ text = text.strip()
+ if not text:
+ return {"error": "Empty input"}
- doc = nlp(text)
+ # Extract quantity
+ quantity, remaining = parse_quantity(text)
- quantity = None
+ # Extract unit (if the next word is a known unit)
unit = None
- food_tokens = []
-
- # Track which tokens we've consumed
- consumed = set()
-
- # First pass: find numbers (quantity)
- for token in doc:
- if token.like_num or token.pos_ == "NUM":
- try:
- # Handle word numbers like "one", "two"
- if token.text.lower() in {
- "one": 1,
- "two": 2,
- "three": 3,
- "four": 4,
- "five": 5,
- "six": 6,
- "seven": 7,
- "eight": 8,
- "nine": 9,
- "ten": 10,
- "dozen": 12,
- }:
- quantity = {
- "one": 1,
- "two": 2,
- "three": 3,
- "four": 4,
- "five": 5,
- "six": 6,
- "seven": 7,
- "eight": 8,
- "nine": 9,
- "ten": 10,
- "dozen": 12,
- }[token.text.lower()]
- else:
- quantity = float(token.text)
- consumed.add(token.i)
- break
- except ValueError:
- pass
-
- # Second pass: find unit (must be a known measurement unit)
- for token in doc:
- if token.i not in consumed and is_measurement_unit(token.text):
- unit = token.text.lower()
- consumed.add(token.i)
- break
-
- # Third pass: remaining nouns/noun chunks are the food
- for chunk in doc.noun_chunks:
- # Skip chunks that only contain consumed tokens
- chunk_tokens = [t for t in chunk if t.i not in consumed]
- if chunk_tokens:
- # Skip determiners like "a", "the"
- food_text = " ".join(t.text for t in chunk_tokens if t.pos_ != "DET")
- if food_text:
- food_tokens.append(food_text)
- for t in chunk_tokens:
- consumed.add(t.i)
-
- # If no noun chunks, fall back to individual nouns
- if not food_tokens:
- for token in doc:
- if token.i not in consumed and token.pos_ in ("NOUN", "PROPN"):
- food_tokens.append(token.text)
- consumed.add(token.i)
-
- # Also include adjectives that modify food (e.g., "brown sugar")
- food_text = " ".join(food_tokens) if food_tokens else None
-
- if not food_text:
+ words = remaining.split()
+ if words and is_measurement_unit(words[0]):
+ unit = words[0]
+ remaining = " ".join(words[1:])
+
+ # Remove common filler words
+ remaining = re.sub(r"^(of|the)\s+", "", remaining, flags=re.IGNORECASE)
+
+ food = remaining.strip()
+
+ if not food:
return {"error": "Could not identify food", "text": text}
unit = normalize_unit(unit)
- grams = get_grams(quantity or 1.0, unit, food_text) if unit else None
+ grams = get_grams(quantity or 1.0, unit, food) if unit else None
result = {
"quantity": quantity or 1.0,
"unit": unit,
- "food": food_text,
+ "food": food,
}
if grams is not None:
result["grams"] = round(grams, 1)
Single ingredient: dict with quantity, unit, food, grams
Multiple ingredients: list of dicts
"""
- if not SPACY_AVAILABLE:
- return {
- "error": "spaCy required for parsing. Install with: pip install spacy && python -m spacy download en_core_web_sm"
- }
-
- # Use spaCy to split on conjunctions and punctuation intelligently
- doc = nlp(text)
-
- # Find split points (commas, "and" between ingredients)
- parts = []
- current_part = []
-
- for token in doc:
- # Split on comma or semicolon
- if token.text in (",", ";"):
- if current_part:
- parts.append(" ".join(t.text for t in current_part))
- current_part = []
- # Split on "and" when preceded by word (not number) - "flour and sugar"
- elif (
- token.text.lower() == "and"
- and current_part
- and not current_part[-1].like_num
- ):
- parts.append(" ".join(t.text for t in current_part))
- current_part = []
- else:
- current_part.append(token)
-
- if current_part:
- parts.append(" ".join(t.text for t in current_part))
-
- # Filter empty parts
+ # Split on separators
+ parts = SEPARATORS.split(text)
parts = [p.strip() for p in parts if p.strip()]
if len(parts) == 0:
return {"error": "No ingredients found", "text": text}
if len(parts) == 1:
- return parse_with_spacy(parts[0])
+ return parse_single_ingredient(parts[0])
- return [parse_with_spacy(part) for part in parts]
+ return [parse_single_ingredient(part) for part in parts]
class FuzzyMatcher: