o
    0i                     @   s   d dl Z d dlmZ d dlZd dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZ d
d ZdddZdd Zdd Zdd Zdd Zdd Ze
dddd ZdS )    N)settings)convert_from_path)Image)shared_task)SentenceTransformer)predict_categorysuggest_tags)insert_fulltextinsert_chunksinsert_embeddings)embed_modelc                 C   s:   dd l }|dd| } |dd| } |dd| } |  S )Nr   z\r z\n\s+\n
z\s+ )resubstrip)textr    r   //var/www/html/mfile_papteng_new/webapp/tasks.py
clean_text   s
   r      c                 C   s   | sg S g }d}t | }||k rCt|| |}||k r.| | dkr.| d||}||kr.|}| ||  }|r=|| |}||k s|S )Nr   r   )lenminrfindr   append)r   sizechunksstartLendbackchunkr   r   r   
chunk_text#   s"   
r#   c                 C   s   t j| d  }|dv S )N   )z.jpgz.jpegz.pngz.bmpz.tiffz.webp)ospathsplitextlower)r&   extr   r   r   is_image?   s   r*   c                 C   s   |   dS )Nz.pdf)r(   endswith)r&   r   r   r   is_pdfD   s   r,   c                 C   s   t | }tj|ddS )Nindlang)r   openpytesseractimage_to_string)r&   imgr   r   r   extract_image_textK   s   
r4   c                 C   s,   t | }|d}|  }t|dkS )Nr   )fitzr0   	load_pageget_textr   r   )pdf_pathdocpager   r   r   r   
is_scannedP   s   

r;   c                 C   s"   t | }dd |D }d|S )Nc                 S   s   g | ]}|  qS r   )r7   ).0r:   r   r   r   
<listcomp>Y   s    z$extract_pdf_text.<locals>.<listcomp>r   )r5   r0   join)r8   r9   textsr   r   r   extract_pdf_textW   s   

r@   T)bindc              
   C   s  ddl m} |jj|d}|jd}tjtj	|}tj
|s(td| dS z=t|r2t|}n2t|r\t|rWt|}g }|D ]}	tj|	dd}
||
 qBd	|}nt|}ntd
| W dS W n ty} } ztd| W Y d }~dS d }~ww t|}z
t|}t|}W n ty } ztd| d }g }W Y d }~nd }~ww t|j||d t|dd}t|j|}g }tdt|dD ]}|||d  }tj |dd}|!t"| qt#|| td|  dS )Nr   )Attachments)pkz/\z[ERROR] File tidak ditemukan:Fr-   r.   r   z[ERROR] Unsupported file type:z[ERROR] Full OCR/Extract gagal:z[WARNING] Predict error:)attachment_idraw_textcleaned_textr   )r      T)convert_to_numpyz+[CELERY] Fulltext completed for attachment )$webapp.models.koleksirB   objectsgetr&   lstripr%   r>   r   FILE_DIRexistsprintr*   r4   r,   r;   r   r1   r2   r   r@   	Exceptionr   r   r   r	   rC   r#   r
   ranger   r   encodeextendlistr   )selfrD   rB   attrelative	file_path	full_textpagesrawpgtxtecleanedcategory_idtagsr   	chunk_idsall_embeddingsibatchembr   r   r   process_attachment_fulltext`   sh   





	
rg   )r   )r%   django.confr   r5   	pdf2imager   r1   PILr   celeryr   sentence_transformersr   webapp.ml_loader_svcr   r   webapp.db_utilsr	   r
   r   webapp.ml.e5larger   r   r#   r*   r,   r4   r;   r@   rg   r   r   r   r   <module>   s(    
	