import os
from django.conf import settings
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract
from PIL import Image


from celery import shared_task
from sentence_transformers import SentenceTransformer

from webapp.ml_loader_svc import predict_category, suggest_tags
from webapp.db_utils import insert_fulltext, insert_chunks, insert_embeddings


# ======================================================
# Load the embedding model once per Celery worker
# ======================================================
from webapp.ml.e5large import embed_model
# EMBEDDING_MODEL = "intfloat/multilingual-e5-large"
# embed_model = SentenceTransformer(EMBEDDING_MODEL)


# ======================================================
# Utility
# ======================================================
def clean_text(text):
    """Normalize whitespace in *text* and return the trimmed result.

    The substitutions run in order: carriage returns are removed, newline
    pairs separated only by whitespace are collapsed to one newline, and
    finally every run of whitespace (newlines included) becomes a single
    space. Leading and trailing whitespace is stripped from the result.
    """
    import re

    # (pattern, replacement) pairs, applied sequentially.
    substitutions = (
        (r'\r', ''),
        (r'\n\s+\n', '\n'),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def chunk_text(text, size=800):
    """Split *text* into stripped chunks of at most *size* characters.

    Each window spans up to *size* characters. When the character right
    after the window is not a space, the cut is moved back to the last
    space inside the window (if one exists past the window start) so that
    words are not split; a window without such a space is cut hard.
    Chunks that are empty after stripping are dropped.

    Args:
        text: The string to split. A falsy value (``None`` or ``""``)
            yields an empty list.
        size: Maximum chunk length in characters; must be positive.

    Returns:
        A list of non-empty chunk strings, in document order.

    Raises:
        ValueError: If *size* is not positive. A zero size would never
            advance the cursor and a negative one would move it
            backwards, so the loop could not terminate correctly.
    """
    if not text:
        return []

    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + size, text_len)

        # Prefer cutting on a space; `back > start` guarantees progress.
        if end < text_len and text[end] != ' ':
            back = text.rfind(' ', start, end)
            if back > start:
                end = back

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        start = end

    return chunks


# ======================================================
# FILE TYPE DETECTION
# ======================================================
def is_image(path):
    """Return True when *path* has one of the recognized image extensions.

    The comparison is case-insensitive and looks only at the extension
    reported by ``os.path.splitext``.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
    _, extension = os.path.splitext(path)
    return extension.lower() in image_extensions


def is_pdf(path):
    """Return True when *path* ends with ``.pdf`` (case-insensitive)."""
    lowered = path.lower()
    return lowered.endswith(".pdf")


# ======================================================
# OCR & PDF extract
# ======================================================
def extract_image_text(path):
    """Run Tesseract OCR on the image at *path* and return the text.

    The image is opened with a context manager so the underlying file
    handle is released as soon as OCR finishes instead of lingering in a
    long-running worker process.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` using the
        ``ind`` language data.
    """
    with Image.open(path) as img:
        return pytesseract.image_to_string(img, lang="ind")


def is_scanned(pdf_path):
    """Return True when the first page of the PDF has no extractable text.

    Only page 0 is inspected; a document whose first page is an image but
    whose later pages contain text is still reported as scanned. The
    document is closed before returning so the file handle does not leak.

    Args:
        pdf_path: Filesystem path of the PDF to inspect.

    Returns:
        True if the first page's text is empty after stripping
        whitespace, otherwise False.
    """
    with fitz.open(pdf_path) as doc:
        first_page_text = doc.load_page(0).get_text()
    return not first_page_text.strip()


def extract_pdf_text(pdf_path):
    """Return the text of every page in the PDF, joined by newlines.

    The document is opened with a context manager so it is closed once
    all pages have been read.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A single string holding each page's ``get_text()`` output in page
        order, separated by ``"\\n"``.
    """
    with fitz.open(pdf_path) as doc:
        # Build the list inside the `with` so every page is read before close.
        page_texts = [page.get_text() for page in doc]
    return "\n".join(page_texts)


# ======================================================
# CELERY TASK
# ======================================================
@shared_task(bind=True)
def process_attachment_fulltext(self, attachment_id):
    """Extract, store, chunk and embed the full text of one attachment.

    Steps:
      1. Load the ``Attachments`` row and resolve its file under
         ``settings.FILE_DIR``.
      2. Extract text: OCR for images; for PDFs, OCR of rendered pages
         when ``is_scanned`` is true, otherwise direct text extraction.
      3. Run category/tag prediction on the cleaned text.
      4. Persist raw and cleaned text, then the chunks, then the chunk
         embeddings.

    Args:
        attachment_id: Primary key of the ``Attachments`` row to process.

    Returns:
        True when the pipeline completes. False when the file does not
        exist, the file type is unsupported, or extraction raises.

    Exceptions raised by the model lookup, the ``insert_*`` helpers or
    ``embed_model.encode`` are not caught here and propagate to Celery.
    """
    # Model import is kept inside the task body, as in the original.
    from webapp.models.koleksi import Attachments

    attachment = Attachments.objects.get(pk=attachment_id)

    # Strip leading separators so the stored path joins under FILE_DIR.
    relative_path = attachment.path.lstrip("/\\")
    file_path = os.path.join(settings.FILE_DIR, relative_path)

    if not os.path.exists(file_path):
        print("[ERROR] File tidak ditemukan:", file_path)
        return False

    # ----------------------------------------------------
    # 1. Extract full text -- images take priority over PDFs
    # ----------------------------------------------------
    try:
        if is_image(file_path):
            # Case 1: image file -> OCR directly.
            full_text = extract_image_text(file_path)
        elif is_pdf(file_path):
            # Case 2: PDF -> use the text layer unless it looks scanned.
            if not is_scanned(file_path):
                full_text = extract_pdf_text(file_path)
            else:
                page_images = convert_from_path(file_path)
                ocr_texts = [
                    pytesseract.image_to_string(page_image, lang="ind")
                    for page_image in page_images
                ]
                full_text = "\n".join(ocr_texts)
        else:
            # Case 3: neither image nor PDF.
            print("[ERROR] Unsupported file type:", file_path)
            return False
    except Exception as e:
        print("[ERROR] Full OCR/Extract gagal:", e)
        return False

    cleaned = clean_text(full_text)

    # ----------------------------------------------------
    # 2. Predict category + tags
    # ----------------------------------------------------
    # NOTE(review): category_id and tags are assigned here but never read
    # again in this function -- confirm whether they should be persisted.
    try:
        category_id = predict_category(cleaned)
        tags = suggest_tags(cleaned)
    except Exception as e:
        print("[WARNING] Predict error:", e)
        category_id, tags = None, []

    # ----------------------------------------------------
    # 3. Store raw + cleaned text
    # ----------------------------------------------------
    insert_fulltext(
        attachment_id=attachment.pk,
        raw_text=full_text,
        cleaned_text=cleaned,
    )

    # ----------------------------------------------------
    # 4. Chunk
    # ----------------------------------------------------
    chunks = chunk_text(cleaned, size=800)
    chunk_ids = insert_chunks(attachment.pk, chunks)

    # ----------------------------------------------------
    # 5. Embedding (encoded four chunks at a time)
    # ----------------------------------------------------
    batch_size = 4
    all_embeddings = []
    for offset in range(0, len(chunks), batch_size):
        vectors = embed_model.encode(
            chunks[offset:offset + batch_size],
            convert_to_numpy=True,
        )
        all_embeddings.extend(vectors)

    insert_embeddings(chunk_ids, all_embeddings)

    print(f"[CELERY] Fulltext completed for attachment {attachment_id}")
    return True