o
    Ŧh                      @   sh  d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d dlZd dlZe d
krejejejeZej eddZ!e!ej_"ej eddddZ#ej$d  ej%e# 7  < e Z&e&' Z(dd Z)d dlZdd Z*dd Z+dd Z,dd Z-dd Z.d!dd Z/dS )"    N)ProcessPoolExecutor)ImageImageEnhanceImageFilter)convert_from_bytes)Counter)TfidfVectorizer)cosine_similarity)StemmerFactory)
CategoriesWindows	tesseractztesseract.exetoolspopplerLibrarybinPATHc                 C   s0   |  d} | t } t| }|d} | S )NL   )convertfilterr   MedianFilterr   Contrastenhance)imageenhancer r   :/var/www/html/mfile_papteng_new/webapp/ml_utils_before1.pypreprocess_image_for_ocr   s
   


r   c              
   C   s  | j  }|drz5| d tj|  dd}ddd |D }| r?t	d t	d	t
|d
 t	|d d  |W S W n	 tyI   Y nw z| d t|  }W n tyo } zt	d| W Y d }~dS d }~ww |d d }t	dt
| d dd }t pd}t	d| d t }tjjt d}	t|	||}
W d    n1 sw   Y  t }|| }d|
}| }t
|dkrd|d d }t	d t	d t	| t	d t	d|ddt
| d t	d t
|  d! |S |d"rt| S |d#rozBt }t| }t|}tj|d$d%}t|}t | }t	d& t	| t	d t	d'|dd( t	d t
|  d! |W S  tyn } zt	d)| W Y d }~d*S d }~ww d*S )+Nz.pdfr   pdf)streamfiletype
c                 S   s   g | ]}|  qS r   )get_text).0pager   r   r   
<listcomp>[       z*extract_text_from_file.<locals>.<listcomp>z<[INFO] Ekstraksi teks langsung dari PDF berhasil (tanpa OCR)z$[INFO] Panjang teks hasil ekstraksi:karakteri  z[ERROR] PDF OCR gagal:zDokumen tidak terbaca   z[INFO] Mulai OCR z halaman secara paralel...c                 S   s"   t | }tj|dd}t|}|S )Nindlang)r   pytesseractimage_to_string
clean_text)img	processedtextcleanedr   r   r   ocr_pageq   s   z(extract_text_from_file.<locals>.ocr_page   z[DEBUG] Menggunakan z threads CPU untuk OCR paralel)max_workersiP   z-[INFO] Pemotongan teks ke 50.000 kata pertamaz2
=========== HASIL OCR FINAL (bersih) ===========
z1
===============================================
z[INFO] OCR selesai dalam z.2fz detik untuk z halamanz![INFO] Total panjang teks akhir: z kataz.docx)z.jpgz.jpegz.pngr*   r+   z3
=========== HASIL OCR GAMBAR (bersih) ===========
z [INFO] OCR gambar selesai dalam z detikz[ERROR] Gagal OCR gambar: )namelowerendswithseekfitzopenreadjoinstripprintlen	Exceptionr   os	cpu_counttimeperf_counter
concurrentfuturesThreadPoolExecutorlistmapsplitextract_text_from_docxr   r   r-   r.   r/   )filer9   docr2   imageser4   r6   
start_timeexecutorresultsend_timeelapsed	full_textwordsr   r1   r3   r   r   r   extract_text_from_fileT   s   






r[   c                 C   s    t | }ddd |jD S )Nr"   c                 S   s   g | ]}|j qS r   r2   )r$   parar   r   r   r&      s    z*extract_text_from_docx.<locals>.<listcomp>)docxDocumentr@   
paragraphs)rP   rQ   r   r   r   rO      s   
rO   c                 C   s    |   } tdd| } t| S )Nz[^a-z\s]r7   )r:   resubstemmerstemr\   r   r   r   r/      s   
r/   c                   C   s   dd t j D S )Nc                 S   s   i | ]}|j |jqS r   )pkr2   )r$   catr   r   r   
<dictcomp>   s    z*get_categories_from_db.<locals>.<dictcomp>)r   objectsallr   r   r   r   get_categories_from_db   s   rj   c                 C   s   t  }t| }t| |g }t |}t|d |d d }| }| }|| }d}	||	k rOt	d|  t
jjdd }
|
rG|
jS t| | S t	dt| | d|  t| | S )Ng      ?zA[DEBUG] Skor terlalu rendah (%.2f), fallback ke 'Dokumen Lainnya'zdokumen lainnya)text__icontainsz[DEBUG] Predicted category pk:z
score=%.3f)rj   r/   rL   valuesr   fit_transformr	   flattenargmaxrB   r   rh   r   firstre   keys)r2   
categoriesstemmedcorpustfidfsimsscoresbest_idx
best_score	thresholdfallbackr   r   r   predict_category   s   r}      c                    s^   h d t | }| }td|  fdd|D }t||}dd |D }td| |S )N>   danhalibuakanatauharipadasayayangbapakdalamtahununtukdenganperihaltanggalz[DEBUG] Hasil OCR + stemmed:c                    s$   g | ]}t |d kr| vr|qS )   )rC   )r$   w	STOPWORDSr   r   r&      s   $ z suggest_tags.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r   r   )r$   r   _r   r   r   r&      r'   z&[DEBUG] Suggested tags (unsupervised):)r/   rN   rB   r   most_common)r2   top_nrt   rZ   filteredfreqtagsr   r   r   suggest_tags   s   

r   )r~   )0r=   r^   r-   concurrent.futuresr   PILr   r   r   	pdf2imager   rE   platformra   collectionsr   sklearn.feature_extraction.textr   sklearn.metrics.pairwiser	   Sastrawi.Stemmer.StemmerFactoryr
   webapp.models.koleksir   rI   rG   systempathdirnameabspath__file__BASE_DIRr@   TESSERACT_PATHtesseract_cmdPOPPLER_BINenvironpathsepfactorycreate_stemmerrc   r   r[   rO   r/   rj   r}   r   r   r   r   r   <module>   s@   2[