o
    ȅ5iVB                  
   @   s  d dl Z d dlZe jdd e  d dlmZ d dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) e* dkre j+,e j+,e j+-e.Z/e j+0e/dde
j
_1e j+0e/ddddZ2e jd  e j3e2 7  < e# 4 Z5e% Z6e7e68 Z9d dl:Z:e j+0e j+,e j+-e.dZ;z*e j+0e;dZ<e j+0e;dZ=e j+0e;dZ>e:?e<a@e:?e=aAe:?e>aBeCd W n  eDy& ZE zda@daAdaBeCdeE  W Y dZE[EndZE[Eww i ZFdejfdd ZGd!d" ZHdHd$d%ZId&d' ZJd(eKd)eKfd*d+ZLd(eKd)eKfd,d-ZMd.d/ ZNdHd0d1ZOdId5d6ZPdJd7d8ZQd9d: ZRd;d< ZSdKd>d?ZTdLd@dAZUd(eKfdBdCZVdMd(eKdEeWfdFdGZXdS )N    NDJANGO_SETTINGS_MODULEzwebapp.settings)
Categories)convert_from_bytesconvert_from_path)ImageImageEnhanceImageFilterImageOps)fix_text)	unidecode)Counter)Pool)partial)StemmerFactory)StopWordRemoverFactory)TfidfVectorizer)cosine_similarityWindows	tesseractztesseract.exetoolspopplerLibrarybinPATHtrained_nlp_modelz	model.pklzvectorizer.pklzlabel_encoder.pklu$   [INFO] Model NLP berhasil dimuat ✅z[WARN] Model NLP belum dimuat: imgc                 C   s   t |   S )z!Hash cepat halaman untuk caching.)hashlibmd5tobytes	hexdigestr    r!   7/var/www/html/mfile_papteng_new/webapp/ml_loader_svc.pyhash_image_turboC   s   r#   c                 C   s$   |  d} t| } | tj} | S )NL)convertr	   autocontrastfilterr   SHARPENr    r!   r!   r"   preprocess_image_turboH   s   

r)   ind+engc                 C   sL   t | }|tv rt| S t| } tj| |d}tdd| }|t|< |S Nlang\s+ )r#   TURBO_PAGE_CACHEr)   pytesseractimage_to_stringresubstrip)r   r-   htextr!   r!   r"   process_page_turboO   s   r8   c                 C   s8   |  dd}| }t|dd  }|t|  S )N)   i  r$   r9   )resizer%   	histogramsumlengetdata)r   	img_smallpixelswhite_pixelsr!   r!   r"   turbo_text_density\   s   rB   r7   returnc                 C   s  t | } t| } tdd| } tdd| } tdd| } tdd| } tdd| } td	d
| } tdd| } tdd| } tdd| } tdd| } tdd| } tdd| } tdd| } tdd| } tdd| } tdd| } tdd| } tdd| } |  S )Nu   [_\-–—=]+r/   z([a-z])\s*-\s*([a-z])z\1\2z([a-z])\1{2,}z\1z(ee|ae|oe|ie){2,}z[^\w\s.,:()/\-]z(\d)\s*[\.,]\s*(\d)z\1.\2z(\d)\s*:\s*(\d)z\1:\2z\s*r\s*p\s*z rpz\s+[a-zA-Z0-9]\s+z	(\d):(\d)r.   z\b[a-zA-Z]{25,}\b z(?<=[a-z])(?=[A-Z])z([a-z]{2,})([A-Z]{2,})z\1 \2z\.{2,}z. z[^a-zA-Z0-9.,;:()\-\n\s]z\s*\.\s*\.\s*)r
   r   r3   r4   r5   r7   r!   r!   r"   cleaning_ocr_textf   s*   rF   c                 C   s2   |    }dd |D }dd |D }d|S )Nc                 S   s   g | ]}|t vr|qS r!   )	stopwords.0tr!   r!   r"   
<listcomp>   s    z&normalize_ocr_text.<locals>.<listcomp>c                 S   s   g | ]}t |qS r!   )stemmerstemrH   r!   r!   r"   rK          r/   )lowersplitjoin)r7   tokensr!   r!   r"   normalize_ocr_text}   s   
rS   c                 C   s>   t | d}t|ddtj\}}t |}|j}|| S )Nr$         )nparrayr%   cv2	thresholdTHRESH_BINARY_INVcount_nonzerosize)pil_imgr   _binarytext_pixelstotal_pixelsr!   r!   r"   fast_text_density   s
   
rb   c                 C   s$   t j| |d}tdd| }|S r+   )r1   r2   r3   r4   r5   )pager-   r7   r!   r!   r"   process_page   s   rd         0u  c                    s  |  d |  }t||d|d}g }t|D ]\}}	|dkr%||	 qt|	}
|
dkr2||	 qg }d}tdtt	 d d}t
|d, fd	d
|D }|D ]}| }|t|7 }||krg n|| qUW d    n1 sww   Y  d|S )Nr   jpeg)dpifmt	last_page
ףp=
?         	processesc                    s   g | ]
} t| fqS r!   )apply_asyncr8   )rI   r   r-   poolr!   r"   rK          z ocr_pdf_fast.<locals>.<listcomp>
)seekreadr   	enumerateappendrB   maxminos	cpu_countr   getr=   rQ   )fileri   r-   	max_pages	max_chars	pdf_bytespages
kept_pagesir   ratiotextstotal_charsworkersjobsjobr7   r!   rs   r"   ocr_pdf_fast   s4   



r   c                    s8  |d u rt jt jt jtd}t j|dd t j|d}| d t|d}|	| 
  W d    n1 s=w   Y  t jt j|d }t j|| t j dd t||ddd	d
}t|D ]\}	}
t j d|	d dd}|
|d qi fddtt  D }|d g}|dd  }|D ]}t|}t|}|dkrt | q fddtt  D }|d d }g }d}tt|d}ttdt  d d&}|||D ]}||krtd  n|t|7 }|| qW d    n	1 sw   Y  tj dd t | d|S )N	ocr_cacheT)exist_okztemp_ocr_input.pdfr   wbrh   rf      )ri   rj   rk   thread_countpage_rm   03d.jpgJPEGc                    $   g | ]}| d rtj |qS r   endswithr}   pathrQ   rI   fcache_subdirr!   r"   rK         $ z"ocr_pdf_cached.<locals>.<listcomp>rl   c                    r   r   r   r   r   r!   r"   rK      r   r,   rn   rp   u4   ⚠️ Limit 50.000 karakter tercapai, hentikan OCR.)ignore_errorsrv   ) r}   r   rQ   dirnameabspath__file__makedirsrw   openwriterx   splitextbasenamer   ry   savesortedlistdirr   rb   remover   rd   r   r|   r~   imapprintr=   rz   shutilrmtree)r   	cache_dirri   r-   r   temp_pdf_pathr   pdf_namer   r   rc   
image_pathcached_imagesr   cache_imager   r   r   r   process_funcrt   r7   r!   r   r"   ocr_pdf_cached   sR   





r   c                   C   s   dd t j D S )Nc                 S   s   i | ]}|j |jqS r!   )pkr7   )rI   catr!   r!   r"   
<dictcomp>   rN   z*get_categories_from_db.<locals>.<dictcomp>)r   objectsallr!   r!   r!   r"   get_categories_from_db   s   r   c                 C   s   t j| sdS t j| d  }|dv r,zt| }tj|ddp$dW S    Y dS |dkrfz/t	| }|
d}|  }|rF|W S | }td|j|jg|j}tj|ddp^dW S    Y dS dS )	u   
    Extract cepat sesuai logika utama:
    - Gambar → OCR halaman 1
    - PDF scanned → OCR halaman 1 saja
    - PDF normal → extract text halaman 1 saja
    rD   rm   )r   .jpeg.pngz.bmpz.tiffz.webpindr,   .pdfr   RGB)r}   r   existsr   rO   r   r   r1   r2   fitz	load_pageget_textr5   
get_pixmap	frombyteswidthheightsamples)r   extr   docrc   r7   pixr!   r!   r"   quick_extract_text   s,   	


r   Tc              
   C   s  t   }| j }d}|droz7| d tj|  dd}ddd |D }t	|
 d	kr6t|}ntd
t	||krF|d | }W nf tyn } z|rYtd| d | d t| |d}W Y d }~nDd }~ww |drdd l}	|	| }ddd |jD }t|}n!|drt| }
tj|
dd}t|}n|rtd|  dS t   | }|rtd|ddt	|  td|d d  d |
 S )NrD   r   r   pdf)streamfiletyperv   c                 S   s   g | ]}|  qS r!   )r   )rI   rc   r!   r!   r"   rK   #      z*extract_text_from_file.<locals>.<listcomp>2   z6Teks hasil fitz terlalu sedikit, gunakan OCR fallback.z[WARN] PDF fitz gagal: z, fallback ke OCR (cached)...)r   z.docxc                 S   s   g | ]}|j qS r!   rE   )rI   pr!   r!   r"   rK   2  s    )r   r   r   r*   r,   z#[WARN] Format file tidak didukung: z[INFO] Ekstraksi selesai dalam .2fz detik, jumlah karakter: z[DEBUG] Preview teks:
i  z...)timenamerO   r   rw   r   r   rx   rQ   r=   r5   rF   
ValueError	Exceptionr   r   docxDocument
paragraphsr   r1   r2   )r   r   log
start_timer   
hasil_textr   r7   er   imageelapsedr!   r!   r"   extract_text_from_file  sH   










r   c              
   C   sT  t   }ztrAtrAtrAt| }t|g}t|}t|d }tj	j
|d }|r0|j}nHtj	j
dd }	|	r>|	jnd }n7|rGtd t }
t| }t|
 |g }t |}t|d |d d }| }| }t|
 | }t   | }|rtd|dd|  |W S  ty } z|rtd	|  W Y d }~d S d }~ww )
Nr   )text__icontainsdokumen lainnyaz9[WARN] Model belum dimuat, fallback ke cosine similarity.z'[INFO] Prediksi kategori selesai dalam r   z detik, kategori ID: z![ERROR] Gagal prediksi kategori: )r   ml_model
vectorizerlabel_encoderrF   	transformpredictinverse_transformr   r   r'   firstr   r   r   rS   listvaluesr   fit_transformr   flattenargmaxkeysr   )r7   r   r   cleanedvecpredlabelr   resultfallback
categoriesstemmedcorpustfidfsimsscoresbest_idxr   r   r!   r!   r"   predict_categoryH  s:   
r  c                 C   s   ddl m } t|  }td|}|r|d nd}td|}|r*|dnd}td|}|r;|d nd}td|}	|	rL|	d nd}
||||
d	S )
z
    Ekstrak entitas penting dari teks hasil OCR secara cepat.
    Mengambil: instansi, tahun, daerah, dan kategori dasar.
    r   )datetimez&(dinas|badan|sekretariat)[a-z\s]{3,40}Nz\b(20\d{2})\brm   z)(provinsi|kabupaten|kota)\s+[a-z\s]{3,40}zf(peraturan daerah|peraturan bupati|nota dinas|kontrak|spm|sp2d|surat keputusan|produk hukum|pengadaan))instansitahundaerahkategori)r  r   rO   r3   searchgroupr5   )r7   r  text_lowinstansi_matchr  tahun_matchr  daerah_matchr  kategori_matchr	  r!   r!   r"   extract_entities_from_ocrp  s$   r  r   top_nc                    s  zt | }W n ty   i }Y nw |d}|d}|d}|d}i dg ddddgd	d	d
gddgddgdddgdddgdg ddddgdddgdg ddg ddddgd d!d"gd#d#d$gd%g d&d'g d(d)gd*d+gd,d-gd.gd/}|   d0}| D ]\}	}
t fd1d2|
D r|	} nq|s|r|}td3d4 }d5d6 | D }d7d6 t	|
|D }g }||||fD ]}|rt| r|t|  q|| ttd8d6 |D d0d9 }td:|  |S );u   
    Analisis teks dokumen dan hasil entitas untuk menghasilkan tag otomatis.
    ⚡ Optimized: sangat cepat dan hasil maksimal 4 tag unik.
    r	  r  r  r  produk hukum)peraturan daerahperaturan bupatir  r  perdar  perbupzsurat edaranz
nota dinaszsurat keputusan bupatiz	sk bupatizsurat keputusan skpdzsurat keputusan kepala dinasskpdkontrak)r  spkzperjanjian kerjazpengadaan barangzbarang dan jasazpengadaan kontruksi	kontruksipembangunanzrumah dan kontrakan)	kontrakanzrumah dinasz
sewa rumahzjalan dan jembatan)jalanjembatanaspalzsarana dan prasarana lainsarana	prasaranazpengairan dan irigasi	pengairanirigasizsurat berhargaobligasizspp, spm, sp2d)sppspmsp2dzsurat tagihan, nota, kwitansi)notakwitansitagihan	disposisizdokumen lainlainnya	konsultan
konsultasijasa lainnya)r,  r   zjasa konsultasir0  Nc                 3   s    | ]}| v V  qd S )Nr!   )rI   w
text_lowerr!   r"   	<genexpr>  s    zsuggest_tags.<locals>.<genexpr>z[^a-zA-Z\s]r/   c                 S   s   g | ]
}t |d kr|qS r   )r=   rH   r!   r!   r"   rK     ru   z suggest_tags.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r!   r!   )rI   r1  r^   r!   r!   r"   rK     r   c                 S   s    g | ]}t |d kr| qS )rn   )r=   titlerH   r!   r!   r"   rK     s     r   z[AUTO TAG] Hasil tag: )r  r   r   rO   itemsanyr3   r4   rP   r   most_commonstrr5   rz   extendr   dictfromkeysr   )r7   r  entitiesr	  r  r  r  kategori_mapkategori_detectedkwords
text_basicrR   keywords	hasil_tagitemr!   r2  r"   suggest_tags  s   



	

 rG  )r*   )re   r*   rf   rg   )Nre   r*   rg   )rg   T)Tr5  )Yr}   djangoenviron
setdefaultsetupwebapp.models.koleksir   r   r3   r   r1   platformnumpyrV   rX   r   r   	pdf2imager   r   PILr   r   r   r	   ftfyr
   r   collectionsr   multiprocessingr   	functoolsr   Sastrawi.Stemmer.StemmerFactoryr   /Sastrawi.StopWordRemover.StopWordRemoverFactoryr   sklearn.feature_extraction.textr   sklearn.metrics.pairwiser   systemr   r   r   r   BASE_DIRrQ   tesseract_cmdPOPPLER_BINpathsepcreate_stemmerrL   stop_factorysetget_stop_wordsrG   joblib	MODEL_DIR
model_pathvectorizer_pathlabel_encoder_pathloadr   r   r   r   r   r   r0   r#   r)   r8   rB   r:  rF   rS   rb   rd   r   r   r   r   r   r  r  intrG  r!   r!   r!   r"   <module>   s    





	


*/
-
.($