a
    ؁h                     @   sD  d dl Z d dlZd dlZd dlmZmZmZ d dlmZ d dl	Z	d dl
Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ e
 d	kre	je	je	jeZe	jed
dZeej_e	jeddddZe	j d  e	j!e 7  < e Z"e"# Z$dd Z%dd Z&dd Z'dd Z(dd Z)dd Z*d ddZ+dS )!    N)ImageImageEnhanceImageFilter)convert_from_bytes)Counter)TfidfVectorizer)cosine_similarity)StemmerFactory)
CategoriesWindowsZ	tesseractztesseract.exeZtoolsZpopplerLibrarybinPATHc                 C   s0   |  d} | t } t| }|d} | S )NL   )convertfilterr   ZMedianFilterr   ContrastZenhance)imageZenhancer r   ED:\WORK\M-FILE\MFILE_PAPTENG_NEW\mfile_papteng_new\webapp\ml_utils.pypreprocess_image_for_ocr   s
    


r   c              
   C   sh  | j  }|drzB| d tj|  dd}ddd |D }| rT|W S W n t	yh   Y n0 z| d t
|  }W n0 t	y } ztd| W Y d }~d	S d }~0 0 g }|D ]$}t|}tj|d
d}	||	 qd|S |drt| S |drdz"t| }
t|
}tj|d
dW S  t	yb } ztd| W Y d }~dS d }~0 0 dS )Nz.pdfr   pdf)streamfiletype
c                 S   s   g | ]}|  qS r   )get_text).0pager   r   r   
<listcomp>,       z*extract_text_from_file.<locals>.<listcomp>z[ERROR] PDF OCR gagal:zDokumen tidak terbacaind)langz.docx)z.jpgz.jpegz.pngz[ERROR] Gagal OCR gambar: )namelowerendswithseekfitzopenreadjoinstrip	Exceptionr   printr   pytesseractZimage_to_stringappendextract_text_from_docxr   )filer$   doctextimageseZocr_textimg	processedresultr   r   r   r   extract_text_from_file%   sB    








r:   c                 C   s    t | }ddd |jD S )Nr   c                 S   s   g | ]
}|j qS r   r4   )r   parar   r   r   r   Q   r    z*extract_text_from_docx.<locals>.<listcomp>)docxDocumentr+   
paragraphs)r2   r3   r   r   r   r1   O   s    
r1   c                 C   s    |   } tdd| } t| S )Nz[^a-z\s] )r%   resubstemmerstemr;   r   r   r   
clean_textT   s    rE   c                   C   s   dd t j D S )Nc                 S   s   i | ]}|j |jqS r   )pkr4   )r   catr   r   r   
<dictcomp>[   r    z*get_categories_from_db.<locals>.<dictcomp>)r
   objectsallr   r   r   r   get_categories_from_dbZ   s    rK   c                 C   s   t  }t| }t| |g }t |}t|d |d d }| }| }|| }d}	||	k rt	d|  t
jjdd }
|
r|
jS t| | S t	dt| | d|  t| | S )Ng      ?zA[DEBUG] Skor terlalu rendah (%.2f), fallback ke 'Dokumen Lainnya'zdokumen lainnya)Ztext__icontainsz[DEBUG] Predicted category pk:z
score=%.3f)rK   rE   listvaluesr   Zfit_transformr   flattenargmaxr.   r
   rI   r   firstrF   keys)r4   
categoriesstemmedZcorpusZtfidfZsimsZscoresZbest_idxZ
best_score	thresholdfallbackr   r   r   predict_category]   s    rW      c                    s^   h d t | }| }td|  fdd|D }t||}dd |D }td| |S )N>   ZdalamharitahunperihalZakanZibuZsayaZyangZpadaZatauZbapakZdenganZhalZuntukZdantanggalz[DEBUG] Hasil OCR + stemmed:c                    s$   g | ]}t |d kr| vr|qS )   )len)r   wZ	STOPWORDSr   r   r   |   r    z suggest_tags.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r   r   )r   r_   _r   r   r   r      r    z&[DEBUG] Suggested tags (unsupervised):)rE   splitr.   r   most_common)r4   Ztop_nrT   wordsfilteredfreqtagsr   r`   r   suggest_tagsr   s    

rh   )rX   ),r(   r=   r/   PILr   r   r   	pdf2imager   osplatformrA   collectionsr   Zsklearn.feature_extraction.textr   Zsklearn.metrics.pairwiser   ZSastrawi.Stemmer.StemmerFactoryr	   webapp.models.koleksir
   systempathdirnameabspath__file__BASE_DIRr+   ZTESSERACT_PATHZtesseract_cmdZPOPPLER_BINenvironpathsepfactoryZcreate_stemmerrC   r   r:   r1   rE   rK   rW   rh   r   r   r   r   <module>   s6   *