o
    ؁h                     @   sD  d dl Z d dlZd dlZd dlmZmZmZ d dlmZ d dl	Z	d dl
Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ e
 d	kr|e	je	je	jeZe	jed
dZeej_e	jeddddZe	j d  e	j!e 7  < e Z"e"# Z$dd Z%dd Z&dd Z'dd Z(dd Z)dd Z*d ddZ+dS )!    N)ImageImageEnhanceImageFilter)convert_from_bytes)Counter)TfidfVectorizer)cosine_similarity)StemmerFactory)
CategoriesWindows	tesseractztesseract.exetoolspopplerLibrarybinPATHc                 C   s0   |  d} | t } t| }|d} | S )NL   )convertfilterr   MedianFilterr   Contrastenhance)imageenhancer r   2/var/www/html/mfile_papteng_new/webapp/ml_utils.pypreprocess_image_for_ocr   s
   


r   c              
   C   sb  | j  }|drwz!| d tj|  dd}ddd |D }| r*|W S W n	 t	y4   Y nw z| d t
|  }W n t	yZ } ztd| W Y d }~d	S d }~ww g }|D ]}t|}tj|d
d}	||	 q_d|S |drt| S |drzt| }
t|
}tj|d
dW S  t	y } ztd| W Y d }~dS d }~ww dS )Nz.pdfr   pdf)streamfiletype
c                 S   s   g | ]}|  qS r   )get_text).0pager   r   r   
<listcomp>,       z*extract_text_from_file.<locals>.<listcomp>z[ERROR] PDF OCR gagal:zDokumen tidak terbacaind)langz.docx)z.jpgz.jpegz.pngz[ERROR] Gagal OCR gambar: )namelowerendswithseekfitzopenreadjoinstrip	Exceptionr   printr   pytesseractimage_to_stringappendextract_text_from_docxr   )filer*   doctextimageseocr_textimg	processedresultr   r   r   r   extract_text_from_file%   sN   









rB   c                 C   s    t | }ddd |jD S )Nr!   c                 S   s   g | ]}|j qS r   r;   )r#   parar   r   r   r%   Q   s    z*extract_text_from_docx.<locals>.<listcomp>)docxDocumentr1   
paragraphs)r9   r:   r   r   r   r8   O   s   
r8   c                 C   s    |   } tdd| } t| S )Nz[^a-z\s] )r+   resubstemmerstemrC   r   r   r   
clean_textT   s   
rM   c                   C   s   dd t j D S )Nc                 S   s   i | ]}|j |jqS r   )pkr;   )r#   catr   r   r   
<dictcomp>[   s    z*get_categories_from_db.<locals>.<dictcomp>)r
   objectsallr   r   r   r   get_categories_from_dbZ   s   rS   c                 C   s   t  }t| }t| |g }t |}t|d |d d }| }| }|| }d}	||	k rOt	d|  t
jjdd }
|
rG|
jS t| | S t	dt| | d|  t| | S )Ng      ?zA[DEBUG] Skor terlalu rendah (%.2f), fallback ke 'Dokumen Lainnya'zdokumen lainnya)text__icontainsz[DEBUG] Predicted category pk:z
score=%.3f)rS   rM   listvaluesr   fit_transformr   flattenargmaxr4   r
   rQ   r   firstrN   keys)r;   
categoriesstemmedcorpustfidfsimsscoresbest_idx
best_score	thresholdfallbackr   r   r   predict_category]   s   rg      c                    s^   h d t | }| }td|  fdd|D }t||}dd |D }td| |S )N>   danhalibuakanatauharipadasayayangbapakdalamtahununtukdenganperihaltanggalz[DEBUG] Hasil OCR + stemmed:c                    s$   g | ]}t |d kr| vr|qS )   )len)r#   w	STOPWORDSr   r   r%   |   s   $ z suggest_tags.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r   r   )r#   r{   _r   r   r   r%      r&   z&[DEBUG] Suggested tags (unsupervised):)rM   splitr4   r   most_common)r;   top_nr^   wordsfilteredfreqtagsr   r|   r   suggest_tagsr   s   

r   )rh   ),r.   rE   r5   PILr   r   r   	pdf2imager   osplatformrI   collectionsr   sklearn.feature_extraction.textr   sklearn.metrics.pairwiser   Sastrawi.Stemmer.StemmerFactoryr	   webapp.models.koleksir
   systempathdirnameabspath__file__BASE_DIRr1   TESSERACT_PATHtesseract_cmdPOPPLER_BINenvironpathsepfactorycreate_stemmerrK   r   rB   r8   rM   rS   rg   r   r   r   r   r   <module>   s8   *