From: Shane Jaroch <chown_tee@proton.me>
Date: Mon, 26 Jan 2026 16:09:09 +0000 (-0500)
Subject: wip
X-Git-Url: https://git.nutra.tk/v2?a=commitdiff_plain;h=378a576e487744cb5fa206d78079bc8dd5cb8b27;p=nutratech%2Fsearch-server.git

wip
---

diff --git a/pylang_serv/apis/openfoodfacts.py b/pylang_serv/apis/openfoodfacts.py
new file mode 100644
index 0000000..5a478be
--- /dev/null
+++ b/pylang_serv/apis/openfoodfacts.py
@@ -0,0 +1,99 @@
+"""Open Food Facts API client.
+
+API Documentation: https://openfoodfacts.github.io/openfoodfacts-server/api/
+"""
+
+from typing import Optional
+import requests
+
+BASE_URL = "https://world.openfoodfacts.org/api/v2"
+
+
+class OpenFoodFactsClient:
+    """Client for Open Food Facts API (one requests.Session per instance)."""
+
+    def __init__(self):
+        self.session = requests.Session()
+        # User-Agent required by OFF API
+        self.session.headers.update(
+            {"User-Agent": "NutraApp/0.1 (https://github.com/nutratech/gui-qt)"}
+        )
+
+    def get_by_barcode(self, barcode: str) -> dict:
+        """Get product by barcode/UPC.
+
+        Args:
+            barcode: UPC or EAN barcode
+
+        Returns:
+            Normalized product dict, or ``{"error": ...}`` when the request
+            fails or the response status is not 1 (reported as not found).
+        """
+        from urllib.parse import quote
+
+        # Percent-encode so "/", "?" or "#" in the input cannot alter the path.
+        url = f"{BASE_URL}/product/{quote(barcode, safe='')}"
+
+        try:
+            response = self.session.get(url, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+        except requests.RequestException as e:
+            return {"error": str(e)}
+
+        if data.get("status") != 1:
+            return {"error": "Product not found", "barcode": barcode}
+        return self._normalize_product(data.get("product", {}))
+
+    def search(self, query: str, page_size: int = 10) -> list[dict]:
+        """Search for products by name.
+
+        Args:
+            query: Search term
+            page_size: Number of results
+
+        Returns:
+            List of normalized products; a one-element list holding an
+            ``{"error": ...}`` dict when the request fails.
+        """
+        params = {"search_terms": query, "page_size": page_size, "json": 1}
+
+        try:
+            response = self.session.get(
+                f"{BASE_URL}/search", params=params, timeout=10
+            )
+            response.raise_for_status()
+            data = response.json()
+        except requests.RequestException as e:
+            return [{"error": str(e)}]
+
+        return [self._normalize_product(p) for p in data.get("products", [])]
+
+    def _normalize_product(self, product: dict) -> dict:
+        """Normalize OFF product to standard format.
+
+        Maps OFF ``*_100g`` nutriment keys to more standard names. Missing
+        fields come back as None; a missing or empty name becomes "Unknown".
+        """
+        # ``or`` (not a .get default) also covers keys present but null/empty.
+        nutriments = product.get("nutriments") or {}
+
+        return {
+            "barcode": product.get("code"),
+            "name": product.get("product_name") or "Unknown",
+            "brand": product.get("brands"),
+            "serving_size": product.get("serving_size"),
+            "nutriscore": product.get("nutriscore_grade"),
+            "nova_group": product.get("nova_group"),
+            "nutrients": {
+                "energy_kcal": nutriments.get("energy-kcal_100g"),
+                "fat": nutriments.get("fat_100g"),
+                "saturated_fat": nutriments.get("saturated-fat_100g"),
+                "carbohydrates": nutriments.get("carbohydrates_100g"),
+                "sugars": nutriments.get("sugars_100g"),
+                "fiber": nutriments.get("fiber_100g"),
+                "proteins": nutriments.get("proteins_100g"),
+                "sodium": nutriments.get("sodium_100g"),
+                "salt": nutriments.get("salt_100g"),
+            },
+        }
diff --git a/pylang_serv/apis/usda.py b/pylang_serv/apis/usda.py
new file mode 100644
index 0000000..efa9c7d
--- /dev/null
+++ b/pylang_serv/apis/usda.py
@@ -0,0 +1,90 @@
+"""USDA FoodData Central API client.
+
+API Documentation: https://fdc.nal.usda.gov/api-guide.html
+"""
+
+from typing import Optional
+import requests
+
+# Free API key (demo key has rate limits)
+# Users should get their own at https://fdc.nal.usda.gov/api-key-signup.html
+DEFAULT_API_KEY = "DEMO_KEY"
+
+BASE_URL = "https://api.nal.usda.gov/fdc/v1"
+
+
+class USDAClient:
+    """Client for USDA FoodData Central API."""
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key or DEFAULT_API_KEY
+
+    def _get(self, path: str, params: Optional[dict] = None) -> dict:
+        """GET ``BASE_URL + path`` with the API key; return JSON or an error dict."""
+        query = {"api_key": self.api_key, **(params or {})}
+        try:
+            response = requests.get(f"{BASE_URL}{path}", params=query, timeout=10)
+            response.raise_for_status()
+            return response.json()
+        except requests.RequestException as e:
+            # requests puts the request URL (query string included) in its
+            # messages; mask the key so it is not handed back to callers.
+            return {"error": str(e).replace(self.api_key, "***")}
+
+    def search(self, query: str, page_size: int = 10) -> dict:
+        """Search for foods by keyword.
+
+        Args:
+            query: Search term (e.g., "chicken breast")
+            page_size: Number of results to return
+
+        Returns:
+            API response with foods array containing fdcId, description, etc.,
+            or ``{"error": ...}`` if the request fails.
+        """
+        return self._get(
+            "/foods/search",
+            {
+                "query": query,
+                "pageSize": page_size,
+                "dataType": ["Foundation", "SR Legacy", "Branded"],
+            },
+        )
+
+    def get_food(self, fdc_id: int) -> dict:
+        """Get detailed food information by FDC ID.
+
+        Args:
+            fdc_id: FoodData Central ID
+
+        Returns:
+            Full food data including nutrients, or an error dict on failure.
+        """
+        return self._get(f"/food/{fdc_id}")
+
+    def get_nutrients(self, fdc_id: int) -> list[dict]:
+        """Get nutrients for a food.
+
+        Args:
+            fdc_id: FoodData Central ID
+
+        Returns:
+            List of nutrients with id, name, amount, unit; empty when the
+            lookup fails or the food lists no nutrients.
+        """
+        data = self.get_food(fdc_id)
+        if "error" in data:
+            return []
+
+        nutrients = []
+        for entry in data.get("foodNutrients", []):
+            info = entry.get("nutrient", {})
+            nutrients.append(
+                {
+                    "id": info.get("id"),
+                    "name": info.get("name"),
+                    "amount": entry.get("amount"),
+                    "unit": info.get("unitName"),
+                }
+            )
+        return nutrients
diff --git a/pylang_serv/parser.py b/pylang_serv/parser.py
index 9f901c4..d8a9321 100644
--- a/pylang_serv/parser.py
+++ b/pylang_serv/parser.py
@@ -1,17 +1,36 @@
-"""Natural language ingredient parser.
+"""Natural language ingredient parser with advanced NLP features.
 
-Parses strings like "2 cups flour" into structured data:
-{"quantity": 2.0, "unit": "cup", "food": "flour", "grams": 250.0}
+Features:
+- Multi-ingredient parsing ("2 cups flour and 1 tsp salt")
+- Fuzzy food matching to database
+- Optional spaCy integration for robust parsing
 """
 
 import re
 from typing import Optional
 
 import pint
+from rapidfuzz import fuzz, process
 
 # Unit registry for conversions
 ureg = pint.UnitRegistry()
 
+# Optional spaCy fallback: nlp stays None unless both package and model load.
+try:
+    # TODO: log warning, give some user feedback in status bar
+    import spacy
+
+    try:
+        nlp = spacy.load("en_core_web_sm")
+        SPACY_AVAILABLE = True
+    except OSError:  # presumably the en_core_web_sm model is not installed
+        nlp = None
+        SPACY_AVAILABLE = False
+except ImportError:  # the spacy package itself is not installed
+    nlp = None
+    SPACY_AVAILABLE = False
+
+
 # Common cooking unit aliases
 UNIT_ALIASES = {
     "tbsp": "tablespoon",
@@ -30,7 +49,6 @@ UNIT_ALIASES = {
 }
 
 # Approximate density conversions (grams per cup) for common ingredients
-# Used when converting volume to weight
 DENSITY_MAP = {
     "flour": 125,
     "all-purpose flour": 125,
@@ -45,17 +63,62 @@ DENSITY_MAP = {
     "honey": 340,
     "oil": 218,
     "salt": 288,
-    # Default for unknown foods
     "_default": 150,
 }
 
+# Count nouns (foods, portions, containers) that the regex may capture as a unit.
+# These should be treated as food, not unit (e.g., "3 eggs" -> food="eggs", not unit="egg")
+COUNT_NOUNS = {
+    "egg",
+    "eggs",
+    "apple",
+    "apples",
+    "banana",
+    "bananas",
+    "orange",
+    "oranges",
+    "clove",
+    "cloves",
+    "slice",
+    "slices",
+    "piece",
+    "pieces",
+    "stick",
+    "sticks",
+    "head",
+    "heads",
+    "bunch",
+    "bunches",
+    "sprig",
+    "sprigs",
+    "leaf",
+    "leaves",
+    "can",
+    "cans",
+    "package",
+    "packages",
+    "bag",
+    "bags",
+    "box",
+    "boxes",
+    "bottle",
+    "bottles",
+    "jar",
+    "jars",
+}
+
+# Ingredient separators: ",", ", and", ";" or " and " (but not in "1 and 1/2")
+SEPARATORS = re.compile(
+    r"\s*(?:,\s*(?:and\s+)?|(?<!\d)\s+and\s+|\s*;\s*)\s*", re.IGNORECASE
+)
+
 # Pattern to match: [quantity] [unit] [of] [food]
 INGREDIENT_PATTERN = re.compile(
     r"^\s*"
-    r"(?P<quantity>[\d./]+(?:\s*[\d./]+)?)\s*"  # quantity (e.g., "2", "1/2", "1 1/2")
+    r"(?P<quantity>[\d./]+(?:\s+[\d./]+)?)\s*"  # quantity
     r"(?P<unit>[a-zA-Z]+)?\s*"  # optional unit
     r"(?:of\s+)?"  # optional "of"
-    r"(?P<food>.+?)\s*$",  # food name
+    r"(?P<food>.+?)\s*$",
     re.IGNORECASE,
 )
 
@@ -66,7 +129,6 @@ def parse_fraction(s: str) -> float:
     parts = s.split()
 
     if len(parts) == 2:
-        # Mixed number like "1 1/2"
         whole = float(parts[0])
         frac = parse_fraction(parts[1])
         return whole + frac
@@ -82,7 +144,7 @@ def normalize_unit(unit: Optional[str]) -> Optional[str]:
     """Normalize unit to standard form."""
     if not unit:
         return None
-    unit = unit.lower().rstrip("s")  # Remove plural 's'
+    unit = unit.lower().rstrip("s")
     return UNIT_ALIASES.get(unit, unit)
 
 
@@ -95,23 +157,19 @@ def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float
     if not unit:
         return None
 
-    # If already in grams
     if unit == "gram":
         return quantity
 
     try:
-        # Try direct weight conversion
         q = quantity * ureg(unit)
         return q.to("gram").magnitude
     except (pint.UndefinedUnitError, pint.DimensionalityError):
         pass
 
-    # Volume to weight conversion using density
     food_lower = food.lower()
     density = DENSITY_MAP.get(food_lower, DENSITY_MAP["_default"])
 
     try:
-        # Convert to cups first, then multiply by density
         q = quantity * ureg(unit)
         cups = q.to("cup").magnitude
         return cups * density
@@ -119,17 +177,17 @@ def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float
         return None
 
 
-def parse_ingredient(text: str) -> dict:
-    """Parse an ingredient string into structured data.
-
-    Args:
-        text: Natural language ingredient string, e.g., "2 cups flour"
+def parse_single_ingredient(text: str) -> dict:
+    """Parse a single ingredient string into structured data."""
+    text = text.strip()
+    if not text:
+        return {"error": "Empty input"}
 
-    Returns:
-        Dict with keys: quantity, unit, food, grams (optional)
-    """
     match = INGREDIENT_PATTERN.match(text)
     if not match:
+        # Try spaCy if available for difficult cases
+        if SPACY_AVAILABLE and nlp:
+            return _parse_with_spacy(text)
         return {"error": "Could not parse ingredient", "text": text}
 
     quantity_str = match.group("quantity")
@@ -141,6 +199,12 @@ def parse_ingredient(text: str) -> dict:
     except (ValueError, ZeroDivisionError):
         return {"error": f"Invalid quantity: {quantity_str}", "text": text}
 
+    # Check if "unit" is actually a count noun (e.g., "3 eggs" -> unit="egg", food="s")
+    # In that case, merge unit back into food
+    if unit and unit.lower() in COUNT_NOUNS:
+        food = unit + (" " + food if food else "")
+        unit = None
+
     unit = normalize_unit(unit)
     grams = get_grams(quantity, unit, food)
 
@@ -153,3 +217,131 @@ def parse_ingredient(text: str) -> dict:
         result["grams"] = round(grams, 1)
 
     return result
+
+
+def _parse_with_spacy(text: str) -> dict:
+    """Use spaCy for more complex parsing when regex fails.
+
+    Takes the first number-like token that float() accepts as the quantity
+    (1.0 when there is none) and the first noun chunk, else the last NOUN
+    token, as the food. The unit is never extracted on this path.
+    """
+    doc = nlp(text)
+
+    quantity = None
+    for token in doc:
+        if token.like_num:
+            try:
+                quantity = float(token.text)
+                break
+            except ValueError:
+                # Number-like tokens that float() rejects are skipped.
+                pass
+
+    first_chunk = next(iter(doc.noun_chunks), None)
+    food = first_chunk.text if first_chunk is not None else None
+
+    if not food:
+        nouns = [token.text for token in doc if token.pos_ == "NOUN"]
+        food = nouns[-1] if nouns else None
+
+    if not food:
+        return {"error": "Could not parse ingredient", "text": text}
+
+    return {
+        # Explicit None check: a parsed quantity of 0 must not turn into 1.0.
+        "quantity": 1.0 if quantity is None else quantity,
+        "unit": None,
+        "food": food,
+        "parsed_by": "spacy",
+    }
+
+
+def parse_ingredient(text: str) -> dict | list[dict]:
+    """Parse ingredient text, handling multiple ingredients.
+
+    Args:
+        text: Natural language ingredient string
+              Single: "2 cups flour"
+              Multiple: "2 cups flour, 1 tsp salt, and 3 eggs"
+
+    Returns:
+        Single ingredient: dict with quantity, unit, food, grams
+        Multiple ingredients: list of dicts
+        Empty or separator-only text: the single-ingredient error dict
+    """
+    # Split on separators, dropping the empty pieces that leading, trailing
+    # or doubled separators leave behind.
+    parts = [piece.strip() for piece in SEPARATORS.split(text) if piece.strip()]
+
+    if not parts:
+        # Nothing to parse: report it as an error rather than returning an
+        # empty list that looks like a successful multi-ingredient parse.
+        return parse_single_ingredient("")
+    if len(parts) == 1:
+        return parse_single_ingredient(parts[0])
+
+    return [parse_single_ingredient(part) for part in parts]
+
+
+class FuzzyMatcher:
+    """Fuzzy matcher for connecting parsed foods to database entries."""
+
+    def __init__(self, food_names: list[str], min_score: int = 70):
+        """Initialize with list of food names from database.
+
+        Args:
+            food_names: List of food names to match against
+            min_score: Minimum fuzzy match score (0-100)
+        """
+        self.food_names = food_names
+        self.min_score = min_score
+
+    def match(self, query: str, limit: int = 5) -> list[dict]:
+        """Find closest matching foods, ignoring case and punctuation.
+
+        Args:
+            query: Parsed food name to match
+            limit: Max number of matches to return
+
+        Returns:
+            List of {name, score} dicts sorted by score descending
+        """
+        from rapidfuzz.utils import default_process
+
+        if not self.food_names:
+            return []
+
+        # rapidfuzz >= 3.0 no longer preprocesses strings by default, so
+        # normalize explicitly; score_cutoff drops matches below min_score.
+        matches = process.extract(
+            query,
+            self.food_names,
+            scorer=fuzz.token_set_ratio,
+            processor=default_process,
+            limit=limit,
+            score_cutoff=self.min_score,
+        )
+
+        return [{"name": name, "score": score} for name, score, _idx in matches]
+
+    def best_match(self, query: str) -> Optional[str]:
+        """Get single best matching food name.
+
+        Args:
+            query: Parsed food name to match
+
+        Returns:
+            Best matching name or None if no good match
+        """
+        matches = self.match(query, limit=1)
+        return matches[0]["name"] if matches else None
+
+
+# For backwards compatibility
+def parse_ingredient_legacy(text: str) -> dict:
+    """Return one dict: the sole ingredient, or the first of several."""
+    parsed = parse_ingredient(text)
+    if not isinstance(parsed, list):
+        return parsed
+    return parsed[0] if parsed else {"error": "No ingredients found"}
diff --git a/pyproject.toml b/pyproject.toml
index a7a6e7d..edf593d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "flask>=3.0",
     "requests>=2.28",
     "pint>=0.23",
+    "rapidfuzz>=3.0",
 ]
 
 [project.optional-dependencies]