From: Shane Jaroch Date: Mon, 26 Jan 2026 16:09:09 +0000 (-0500) Subject: wip X-Git-Url: https://git.nutra.tk/v2?a=commitdiff_plain;h=378a576e487744cb5fa206d78079bc8dd5cb8b27;p=nutratech%2Fsearch-server.git wip --- diff --git a/pylang_serv/apis/openfoodfacts.py b/pylang_serv/apis/openfoodfacts.py new file mode 100644 index 0000000..5a478be --- /dev/null +++ b/pylang_serv/apis/openfoodfacts.py @@ -0,0 +1,99 @@ +"""Open Food Facts API client. + +API Documentation: https://openfoodfacts.github.io/openfoodfacts-server/api/ +""" + +from typing import Optional +import requests + +BASE_URL = "https://world.openfoodfacts.org/api/v2" + + +class OpenFoodFactsClient: + """Client for Open Food Facts API.""" + + def __init__(self): + self.session = requests.Session() + # User-Agent required by OFF API + self.session.headers.update( + {"User-Agent": "NutraApp/0.1 (https://github.com/nutratech/gui-qt)"} + ) + + def get_by_barcode(self, barcode: str) -> dict: + """Get product by barcode/UPC. + + Args: + barcode: UPC or EAN barcode + + Returns: + Product data or error + """ + url = f"{BASE_URL}/product/{barcode}" + + try: + response = self.session.get(url, timeout=10) + response.raise_for_status() + data = response.json() + + if data.get("status") != 1: + return {"error": "Product not found", "barcode": barcode} + + return self._normalize_product(data.get("product", {})) + except requests.RequestException as e: + return {"error": str(e)} + + def search(self, query: str, page_size: int = 10) -> list[dict]: + """Search for products by name. + + Args: + query: Search term + page_size: Number of results + + Returns: + List of normalized products + """ + url = f"{BASE_URL}/search" + params = { + "search_terms": query, + "page_size": page_size, + "json": 1, + } + + try: + response = self.session.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + products = [] + for product in data.get("products", []): + products.append(self._normalize_product(product)) + return products + except requests.RequestException as e: + return [{"error": str(e)}] + + def _normalize_product(self, product: dict) -> dict: + """Normalize OFF product to standard format. + + Maps OFF nutriment keys to more standard names. + """ + nutriments = product.get("nutriments", {}) + + return { + "barcode": product.get("code"), + "name": product.get("product_name", "Unknown"), + "brand": product.get("brands"), + "serving_size": product.get("serving_size"), + "nutriscore": product.get("nutriscore_grade"), + "nova_group": product.get("nova_group"), + "nutrients": { + "energy_kcal": nutriments.get("energy-kcal_100g"), + "fat": nutriments.get("fat_100g"), + "saturated_fat": nutriments.get("saturated-fat_100g"), + "carbohydrates": nutriments.get("carbohydrates_100g"), + "sugars": nutriments.get("sugars_100g"), + "fiber": nutriments.get("fiber_100g"), + "proteins": nutriments.get("proteins_100g"), + "sodium": nutriments.get("sodium_100g"), + "salt": nutriments.get("salt_100g"), + }, + } diff --git a/pylang_serv/apis/usda.py b/pylang_serv/apis/usda.py new file mode 100644 index 0000000..efa9c7d --- /dev/null +++ b/pylang_serv/apis/usda.py @@ -0,0 +1,90 @@ +"""USDA FoodData Central API client. + +API Documentation: https://fdc.nal.usda.gov/api-guide.html +""" + +from typing import Optional +import requests + +# Free API key (demo key has rate limits) +# Users should get their own at https://fdc.nal.usda.gov/api-key-signup.html +DEFAULT_API_KEY = "DEMO_KEY" + +BASE_URL = "https://api.nal.usda.gov/fdc/v1" + + +class USDAClient: + """Client for USDA FoodData Central API.""" + + def __init__(self, api_key: Optional[str] = None): + self.api_key = api_key or DEFAULT_API_KEY + + def search(self, query: str, page_size: int = 10) -> dict: + """Search for foods by keyword. + + Args: + query: Search term (e.g., "chicken breast") + page_size: Number of results to return + + Returns: + API response with foods array containing fdcId, description, etc. + """ + url = f"{BASE_URL}/foods/search" + params = { + "api_key": self.api_key, + "query": query, + "pageSize": page_size, + "dataType": ["Foundation", "SR Legacy", "Branded"], + } + + try: + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + return {"error": str(e)} + + def get_food(self, fdc_id: int) -> dict: + """Get detailed food information by FDC ID. + + Args: + fdc_id: FoodData Central ID + + Returns: + Full food data including nutrients + """ + url = f"{BASE_URL}/food/{fdc_id}" + params = {"api_key": self.api_key} + + try: + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + return {"error": str(e)} + + def get_nutrients(self, fdc_id: int) -> list[dict]: + """Get nutrients for a food. + + Args: + fdc_id: FoodData Central ID + + Returns: + List of nutrients with id, name, amount, unit + """ + data = self.get_food(fdc_id) + if "error" in data: + return [] + + nutrients = [] + for nutrient in data.get("foodNutrients", []): + n = nutrient.get("nutrient", {}) + nutrients.append( + { + "id": n.get("id"), + "name": n.get("name"), + "amount": nutrient.get("amount"), + "unit": n.get("unitName"), + } + ) + return nutrients diff --git a/pylang_serv/parser.py b/pylang_serv/parser.py index 9f901c4..d8a9321 100644 --- a/pylang_serv/parser.py +++ b/pylang_serv/parser.py @@ -1,17 +1,36 @@ -"""Natural language ingredient parser. +"""Natural language ingredient parser with advanced NLP features. -Parses strings like "2 cups flour" into structured data: -{"quantity": 2.0, "unit": "cup", "food": "flour", "grams": 250.0} +Features: +- Multi-ingredient parsing ("2 cups flour and 1 tsp salt") +- Fuzzy food matching to database +- Optional spaCy integration for robust parsing """ import re from typing import Optional import pint +from rapidfuzz import fuzz, process # Unit registry for conversions ureg = pint.UnitRegistry() +# Try to load spaCy (optional) +try: + # TODO: log warning, give some user feedback in status bar + import spacy + + try: + nlp = spacy.load("en_core_web_sm") + SPACY_AVAILABLE = True + except OSError: + nlp = None + SPACY_AVAILABLE = False +except ImportError: + nlp = None + SPACY_AVAILABLE = False + + # Common cooking unit aliases UNIT_ALIASES = { "tbsp": "tablespoon", @@ -30,7 +49,6 @@ UNIT_ALIASES = { } # Approximate density conversions (grams per cup) for common ingredients -# Used when converting volume to weight DENSITY_MAP = { "flour": 125, "all-purpose flour": 125, @@ -45,17 +63,62 @@ DENSITY_MAP = { "honey": 340, "oil": 218, "salt": 288, - # Default for unknown foods "_default": 150, } +# Count nouns that look like units but are actually foods +# These should be treated as food, not unit (e.g., "3 eggs" → food="eggs", not unit="egg") +COUNT_NOUNS = { + "egg", + "eggs", + "apple", + "apples", + "banana", + "bananas", + "orange", + "oranges", + "clove", + "cloves", + "slice", + "slices", + "piece", + "pieces", + "stick", + "sticks", + "head", + "heads", + "bunch", + "bunches", + "sprig", + "sprigs", + "leaf", + "leaves", + "can", + "cans", + "package", + "packages", + "bag", + "bags", + "box", + "boxes", + "bottle", + "bottles", + "jar", + "jars", +} + +# Ingredient separators for multi-ingredient parsing +SEPARATORS = re.compile( + r"\s*(?:,\s*(?:and\s+)?|(?[\d./]+(?:\s*[\d./]+)?)\s*" # quantity (e.g., "2", "1/2", "1 1/2") + r"(?P[\d./]+(?:\s+[\d./]+)?)\s*" # quantity r"(?P[a-zA-Z]+)?\s*" # optional unit r"(?:of\s+)?" # optional "of" - r"(?P.+?)\s*$", # food name + r"(?P.+?)\s*$", re.IGNORECASE, ) @@ -66,7 +129,6 @@ def parse_fraction(s: str) -> float: parts = s.split() if len(parts) == 2: - # Mixed number like "1 1/2" whole = float(parts[0]) frac = parse_fraction(parts[1]) return whole + frac @@ -82,7 +144,7 @@ def normalize_unit(unit: Optional[str]) -> Optional[str]: """Normalize unit to standard form.""" if not unit: return None - unit = unit.lower().rstrip("s") # Remove plural 's' + unit = unit.lower().rstrip("s") return UNIT_ALIASES.get(unit, unit) @@ -95,23 +157,19 @@ def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float if not unit: return None - # If already in grams if unit == "gram": return quantity try: - # Try direct weight conversion q = quantity * ureg(unit) return q.to("gram").magnitude except (pint.UndefinedUnitError, pint.DimensionalityError): pass - # Volume to weight conversion using density food_lower = food.lower() density = DENSITY_MAP.get(food_lower, DENSITY_MAP["_default"]) try: - # Convert to cups first, then multiply by density q = quantity * ureg(unit) cups = q.to("cup").magnitude return cups * density @@ -119,17 +177,17 @@ def get_grams(quantity: float, unit: Optional[str], food: str) -> Optional[float return None -def parse_ingredient(text: str) -> dict: - """Parse an ingredient string into structured data. - - Args: - text: Natural language ingredient string, e.g., "2 cups flour" +def parse_single_ingredient(text: str) -> dict: + """Parse a single ingredient string into structured data.""" + text = text.strip() + if not text: + return {"error": "Empty input"} - Returns: - Dict with keys: quantity, unit, food, grams (optional) - """ match = INGREDIENT_PATTERN.match(text) if not match: + # Try spaCy if available for difficult cases + if SPACY_AVAILABLE and nlp: + return _parse_with_spacy(text) return {"error": "Could not parse ingredient", "text": text} quantity_str = match.group("quantity") @@ -141,6 +199,12 @@ def parse_ingredient(text: str) -> dict: except (ValueError, ZeroDivisionError): return {"error": f"Invalid quantity: {quantity_str}", "text": text} + # Check if "unit" is actually a count noun (e.g., "3 eggs" → unit="egg", food="s") + # In that case, merge unit back into food + if unit and unit.lower() in COUNT_NOUNS: + food = unit + (" " + food if food else "") + unit = None + unit = normalize_unit(unit) grams = get_grams(quantity, unit, food) @@ -153,3 +217,131 @@ def parse_ingredient(text: str) -> dict: result["grams"] = round(grams, 1) return result + + +def _parse_with_spacy(text: str) -> dict: + """Use spaCy for more complex parsing when regex fails.""" + doc = nlp(text) + + # Extract numbers + quantity = None + for token in doc: + if token.like_num: + try: + quantity = float(token.text) + break + except ValueError: + pass + + # Extract food (noun chunks) + food = None + for chunk in doc.noun_chunks: + food = chunk.text + break + + if not food: + # Fall back to last noun + for token in reversed(doc): + if token.pos_ == "NOUN": + food = token.text + break + + if not food: + return {"error": "Could not parse ingredient", "text": text} + + return { + "quantity": quantity or 1.0, + "unit": None, + "food": food, + "parsed_by": "spacy", + } + + +def parse_ingredient(text: str) -> dict | list[dict]: + """Parse ingredient text, handling multiple ingredients. + + Args: + text: Natural language ingredient string + Single: "2 cups flour" + Multiple: "2 cups flour, 1 tsp salt, and 3 eggs" + + Returns: + Single ingredient: dict with quantity, unit, food, grams + Multiple ingredients: list of dicts + """ + # Split on separators + parts = SEPARATORS.split(text) + parts = [p.strip() for p in parts if p.strip()] + + if len(parts) == 1: + return parse_single_ingredient(parts[0]) + + results = [] + for part in parts: + result = parse_single_ingredient(part) + results.append(result) + + return results + + +class FuzzyMatcher: + """Fuzzy matcher for connecting parsed foods to database entries.""" + + def __init__(self, food_names: list[str], min_score: int = 70): + """Initialize with list of food names from database. + + Args: + food_names: List of food names to match against + min_score: Minimum fuzzy match score (0-100) + """ + self.food_names = food_names + self.min_score = min_score + + def match(self, query: str, limit: int = 5) -> list[dict]: + """Find closest matching foods. + + Args: + query: Parsed food name to match + limit: Max number of matches to return + + Returns: + List of {name, score} dicts sorted by score descending + """ + if not self.food_names: + return [] + + # Use token_set_ratio for better matching of partial/reordered terms + matches = process.extract( + query, + self.food_names, + scorer=fuzz.token_set_ratio, + limit=limit, + ) + + results = [] + for name, score, _idx in matches: + if score >= self.min_score: + results.append({"name": name, "score": score}) + + return results + + def best_match(self, query: str) -> Optional[str]: + """Get single best matching food name. + + Args: + query: Parsed food name to match + + Returns: + Best matching name or None if no good match + """ + matches = self.match(query, limit=1) + return matches[0]["name"] if matches else None + + +# For backwards compatibility +def parse_ingredient_legacy(text: str) -> dict: + """Legacy single-ingredient parser (original API).""" + result = parse_ingredient(text) + if isinstance(result, list): + return result[0] if result else {"error": "No ingredients found"} + return result diff --git a/pyproject.toml b/pyproject.toml index a7a6e7d..edf593d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "flask>=3.0", "requests>=2.28", "pint>=0.23", + "rapidfuzz>=3.0", ] [project.optional-dependencies]