import fitz  # PyMuPDF
import docx
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
from pdf2image import convert_from_bytes
import os
import platform
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from webapp.models.koleksi import Categories
from .ml_model_loader import predict_label
import json

# ======== Setup Tesseract Path (Windows) ========
if platform.system() == "Windows":
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    TESSERACT_PATH = os.path.join(BASE_DIR, 'tesseract', 'tesseract.exe')
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
    POPPLER_BIN = os.path.join(BASE_DIR, 'tools', 'poppler', 'Library', 'bin')
    os.environ["PATH"] += os.pathsep + POPPLER_BIN

# ======== Initialize Sastrawi Stemmer ========
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# ============ 🔍 OCR Image Preprocessing ============
def preprocess_image_for_ocr(image):
    image = image.convert('L')  # grayscale
    image = image.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)  # double the contrast
    return image

# ============ 📄 Extract Text from File ============
def extract_text_from_file(file):
    name = file.name.lower()

    if name.endswith('.pdf'):
        # Try the embedded text layer first; if it is empty or the PDF cannot
        # be parsed, fall back to OCR below.
        try:
            file.seek(0)
            doc = fitz.open(stream=file.read(), filetype="pdf")
            text = "\n".join(page.get_text() for page in doc)
            doc.close()
            if text.strip():
                return text
        except Exception:
            pass

        # OCR fallback
        try:
            file.seek(0)
            images = convert_from_bytes(file.read())
        except Exception as e:
            print("[ERROR] PDF OCR failed:", e)
            return "Dokumen tidak terbaca"

        ocr_text = []
        for img in images:
            processed = preprocess_image_for_ocr(img)
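            # lang='ind' requires the Indonesian traineddata to be available to
            # the bundled Tesseract install.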
            result = pytesseract.image_to_string(processed, lang='ind')
            ocr_text.append(result)
        return "\n".join(ocr_text)

    elif name.endswith('.docx'):
        return extract_text_from_docx(file)

    elif name.endswith(('.jpg', '.jpeg', '.png')):
        try:
            image = Image.open(file)
            processed = preprocess_image_for_ocr(image)
            return pytesseract.image_to_string(processed, lang='ind')
        except Exception as e:
            print("[ERROR] Gagal OCR gambar:", e)
            return ""

    return ""

def extract_text_from_docx(file):
    doc = docx.Document(file)
    return "\n".join([para.text for para in doc.paragraphs])

# ============ 🧹 Normalization + Stemming ============
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)  # keep only letters and whitespace
    return stemmer.stem(text)  # Sastrawi stems words to their Indonesian root form

# ============ 🧠 Category Prediction ============

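# label_mapping.json is assumed to map stringified class indices from the
# classifier to category names, e.g. {"0": "Surat Keputusan", ...}
# (example values here are illustrative only).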
LABEL_MAP_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "indo_model_final",
    "label_mapping.json",
)
with open(LABEL_MAP_PATH, "r") as f:
    label_mapping = json.load(f)

label_mapping = {int(k): v for k, v in label_mapping.items()}
reverse_label_map = {v: k for k, v in label_mapping.items()}


def get_categories_from_db():
    return {cat.pk: cat.text for cat in Categories.objects.all()}

def predict_category(text):
    cleaned = clean_text(text)
    predicted_label = predict_label(cleaned)
    predicted_kategori = label_mapping.get(predicted_label, "Dokumen Lainnya")

    # Look up the matching category's primary key in the database
    # (Categories is already imported at module level)
    kategori_obj = Categories.objects.filter(text__icontains=predicted_kategori).first()
    if kategori_obj:
        print("[DEBUG] Detected category:", predicted_kategori)
        return kategori_obj.pk
    else:
        print("[DEBUG] No match found, falling back to default.")
        return None


# ============ 🏷️ Tag Suggestion (Unsupervised) ============
def suggest_tags(text, top_n=6):
    STOPWORDS = {
        "yang", "dalam", "akan", "dengan", "untuk", "pada", "dan", "atau",
        "saya", "bapak", "ibu", "tanggal", "hari", "tahun", "hal", "perihal"
    }

    stemmed = clean_text(text)
    words = stemmed.split()
    print("[DEBUG] Hasil OCR + stemmed:", words)

    filtered = [w for w in words if len(w) > 3 and w not in STOPWORDS]
    freq = Counter(filtered).most_common(top_n)

    tags = [w for w, _ in freq]
    print("[DEBUG] Suggested tags (unsupervised):", tags)
    return tags
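

# ============ 🧪 Example Usage (illustrative sketch) ============
# A minimal, hypothetical sketch of how the helpers compose, e.g. from a Django
# shell (the relative import of ml_model_loader means this module is imported
# by the web app rather than run as a standalone script). "contoh_surat.pdf" is
# an illustrative path, not part of the project:
#
#     with open("contoh_surat.pdf", "rb") as f:
#         # extract_text_from_file reads f.name to pick the parser
#         # (PDF text layer, DOCX, or image OCR).
#         text = extract_text_from_file(f)
#     tags = suggest_tags(text)              # frequency-based keyword suggestions
#     category_pk = predict_category(text)   # needs the DB and the trained model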
