import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import re

# Input file and output locations
URL_FILE = "urls_produits.txt"
OUTPUT_DIR_TXT = "produits_txt"
OUTPUT_DIR_JSON = "produits_json"
IMAGE_DIR = "images"

# Make sure every output directory exists before scraping starts.
for _output_dir in (OUTPUT_DIR_TXT, OUTPUT_DIR_JSON, IMAGE_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Minimal browser-like User-Agent so the site serves regular HTML.
headers = {"User-Agent": "Mozilla/5.0"}

def extract_jsonld_data(soup):
    """Extract product metadata from the page's JSON-LD <script> tags.

    Scans every ``<script type="application/ld+json">`` in *soup* and returns
    a dict with keys ``name``/``image``/``description``/``sku``/``mpn``/
    ``brand``/``price`` for the first entry whose ``@type`` is (or contains)
    ``"Product"``.  Returns ``{}`` when no parsable Product entry is found.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # <script> tag with no direct text node — nothing to parse.
            continue
        try:
            payload = json.loads(raw.strip())
        except (json.JSONDecodeError, TypeError):
            # Malformed JSON-LD on this tag; try the next one.
            continue
        # A single top-level object is treated as a one-element list.
        entries = payload if isinstance(payload, list) else [payload]
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            # Per JSON-LD, @type may be a plain string or a list of types.
            entry_type = entry.get("@type")
            is_product = entry_type == "Product" or (
                isinstance(entry_type, list) and "Product" in entry_type
            )
            if not is_product:
                continue
            # image may be a URL string or a list of URLs; empty lists yield None
            # (the original indexed [0] unguarded and skipped the whole script).
            image = entry.get("image")
            if isinstance(image, list):
                image = image[0] if image else None
            brand = entry.get("brand")
            offers = entry.get("offers")
            return {
                "name": entry.get("name"),
                "image": image,
                "description": entry.get("description"),
                "sku": entry.get("sku"),
                "mpn": entry.get("mpn"),
                "brand": brand.get("name") if isinstance(brand, dict) else None,
                "price": offers.get("price") if isinstance(offers, dict) else None,
            }
    return {}

# Load the product URLs, one per line, skipping blank lines.
# Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
# could mangle any non-ASCII characters in the URL list.
with open(URL_FILE, "r", encoding="utf-8") as f:
    urls = [line.strip() for line in f if line.strip()]

# Main scraping loop: for each product URL, fetch the page, pull metadata
# (JSON-LD first, HTML fallback), download the product image, then write a
# "#"-delimited TXT line plus an enriched JSON file keyed by the reference.
for url in urls:
    print(f"Traitement de : {url}")
    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Fail fast on HTTP errors (404/500...) instead of silently parsing
        # an error page as if it were a product page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        # Reference comes from the URL slug; the full slug names the image
        # file, the leading token before "-" names the TXT/JSON outputs.
        ref_match = re.search(r'/piece-auto/([\w\-]+)/', url)
        ref_full = ref_match.group(1) if ref_match else "sansref"
        ref = ref_full.split("-")[0]

        # Page title (first <h1>) with a safe fallback.
        titre = soup.find("h1").get_text(strip=True) if soup.find("h1") else "Titre inconnu"

        # HTML description, used only when JSON-LD provides none.
        desc = soup.find("div", class_="product-description")
        description_html = desc.get_text(" ", strip=True) if desc else "Aucune description"

        # Structured product data from JSON-LD, if present.
        jsonld = extract_jsonld_data(soup)

        image_url = jsonld.get("image")
        image_file = ""

        if image_url:
            # Resolve site-relative image paths against the page URL.
            if image_url.startswith("/"):
                image_url = urljoin(url, image_url)
            # Keep the real extension, dropping any query string.
            image_ext = os.path.splitext(image_url)[1].split("?")[0]
            image_file = f"{ref_full}{image_ext}"
            image_path = os.path.join(IMAGE_DIR, image_file)

            try:
                img_res = requests.get(image_url, headers=headers, timeout=10)
                if img_res.status_code == 200:
                    with open(image_path, "wb") as f_img:
                        f_img.write(img_res.content)
                    print(f"✅ Image téléchargée : {image_file}")
                else:
                    print(f"⚠️ Erreur téléchargement image : {image_url}")
                    # Do not reference a file that was never saved.
                    image_file = ""
            except Exception as e:
                print(f"⚠️ Erreur lors du téléchargement de l’image : {e}")
                image_file = ""
        else:
            print(f"⚠️ Aucune image trouvée pour {ref_full}")

        # TXT export: ref#title#description#image, one line per product.
        ligne_txt = f"{ref}#{titre}#{description_html}#{image_file}"
        # Explicit UTF-8 so accented French text survives on any platform
        # (the default codepage on Windows would raise or mangle it).
        with open(os.path.join(OUTPUT_DIR_TXT, f"{ref}.txt"), "w", encoding="utf-8") as out_txt:
            out_txt.write(ligne_txt)

        # JSON export enriched with the JSON-LD fields; HTML description is
        # the fallback when JSON-LD had none.
        json_obj = {
            "ref": ref,
            "titre": titre,
            "description": jsonld.get("description") or description_html,
            "image": image_file,
            "name": jsonld.get("name"),
            "sku": jsonld.get("sku"),
            "mpn": jsonld.get("mpn"),
            "brand": jsonld.get("brand"),
            "price": jsonld.get("price")
        }
        with open(os.path.join(OUTPUT_DIR_JSON, f"{ref}.json"), "w", encoding="utf-8") as out_json:
            json.dump(json_obj, out_json, ensure_ascii=False, indent=2)

    except Exception as e:
        # Top-level guard: one bad URL must not abort the whole batch.
        print(f"⚠️ Erreur globale sur {url} : {e}")