import os
import cv2
import numpy as np
import PyPDF2
import docx
from PIL import Image
import tempfile
from pathlib import Path
import matplotlib.pyplot as plt

class PlagiarismDetectorORB:
    """Detect image plagiarism between two documents with ORB features.

    Images embedded in PDF/DOCX files are extracted, described with ORB
    keypoints/descriptors and compared pairwise with a brute-force Hamming
    matcher whose results are filtered by Lowe's ratio test.
    """

    # Lowe's ratio: the best match must be this much closer than the runner-up.
    LOWE_RATIO = 0.75
    # Upper bound on the number of matches drawn by visualize_matches().
    MAX_DRAWN_MATCHES = 50

    def __init__(self):
        """Create the ORB detector, the brute-force matcher and the threshold."""
        # ORB detector configuration
        self.orb = cv2.ORB_create(
            nfeatures=2000,        # maximum number of features to detect
            scaleFactor=1.2,       # scale factor between pyramid levels
            nlevels=8,             # number of pyramid levels
            edgeThreshold=31,      # border size where features are not detected
            firstLevel=0,          # pyramid level of the source image
            WTA_K=2,               # points compared per BRIEF descriptor element
            scoreType=cv2.ORB_HARRIS_SCORE,  # keypoint ranking score
            patchSize=31           # patch size used by the BRIEF descriptor
        )

        # Brute-force matcher; crossCheck must stay off because knnMatch(k=2)
        # is used for the ratio test.
        self.bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)

        # Minimum similarity (in percent) for an image pair to be reported.
        self.similarity_threshold = 30

    @staticmethod
    def _decode_image(data):
        """Decode encoded image bytes (PNG/JPEG/...) into a BGR array.

        Decoding happens in memory, so no temporary file is needed.

        Returns:
            The decoded image, or None when ``data`` is empty or is not a
            format OpenCV can decode.
        """
        if not data:
            # cv2.imdecode raises on an empty buffer instead of returning None.
            return None
        buffer = np.frombuffer(data, dtype=np.uint8)
        return cv2.imdecode(buffer, cv2.IMREAD_COLOR)

    def extract_images_from_pdf(self, pdf_path):
        """Extract the images embedded in a PDF file.

        Extraction is best-effort: pages without resources are skipped and a
        single undecodable image does not abort the rest of the document.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of decoded BGR images (possibly empty).
        """
        images = []
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # A page is not required to carry a /Resources entry.
                    if '/Resources' not in page:
                        continue
                    resources = page['/Resources']
                    if '/XObject' not in resources:
                        continue
                    xobjects = resources['/XObject'].get_object()
                    for name in xobjects:
                        try:
                            xobject = xobjects[name]
                            if xobject['/Subtype'] != '/Image':
                                continue
                            img = self._decode_image(xobject.get_data())
                        except Exception as e:
                            # Keep going: one broken image must not hide the others.
                            print(f"Error membaca gambar {name} di {pdf_path}: {e}")
                            continue
                        if img is not None:
                            images.append(img)
        except Exception as e:
            print(f"Error membaca PDF {pdf_path}: {e}")
        return images

    def extract_images_from_docx(self, docx_path):
        """Extract the images embedded in a DOCX file.

        Args:
            docx_path: Path to the DOCX file.

        Returns:
            List of decoded BGR images (possibly empty).
        """
        images = []
        try:
            doc = docx.Document(docx_path)
            for rel in doc.part.rels.values():
                if "image" not in rel.reltype:
                    continue
                # Linked (external) images have no embedded part to read.
                if getattr(rel, 'is_external', False):
                    continue
                try:
                    img = self._decode_image(rel.target_part.blob)
                except Exception as e:
                    print(f"Error membaca gambar di {docx_path}: {e}")
                    continue
                if img is not None:
                    images.append(img)
        except Exception as e:
            print(f"Error membaca DOCX {docx_path}: {e}")
        return images

    def extract_text(self, file_path):
        """Extract the plain text of a PDF or DOCX document.

        Args:
            file_path: Path to the document; the suffix selects the parser.

        Returns:
            The extracted text. Unsupported suffixes yield an empty string and
            a parsing error yields whatever was collected before it occurred.
        """
        parts = []
        ext = Path(file_path).suffix.lower()

        try:
            if ext == '.pdf':
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page in pdf_reader.pages:
                        # Guard against a page yielding None instead of a string.
                        parts.append(page.extract_text() or "")
            elif ext == '.docx':
                doc = docx.Document(file_path)
                for paragraph in doc.paragraphs:
                    parts.append(paragraph.text + "\n")
        except Exception as e:
            print(f"Error ekstrak teks: {e}")

        return "".join(parts)

    def extract_features_orb(self, image):
        """Compute ORB keypoints and descriptors for an image.

        Args:
            image: Grayscale, BGR or BGRA image array, or None.

        Returns:
            Tuple ``(keypoints, descriptors)``; ``(None, None)`` when ``image``
            is None.
        """
        if image is None:
            return None, None

        # ORB works on a single-channel image.
        if len(image.shape) == 3:
            if image.shape[2] == 4:
                gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
            else:
                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        keypoints, descriptors = self.orb.detectAndCompute(gray, None)

        return keypoints, descriptors

    def match_features_bf(self, desc1, desc2):
        """Match two descriptor sets with kNN brute force and Lowe's ratio test.

        Args:
            desc1: Query descriptors (or None).
            desc2: Train descriptors (or None).

        Returns:
            Tuple ``(good_matches, similarity_ratio)`` where the ratio is the
            percentage of good matches relative to the smaller descriptor set,
            capped at 100.
        """
        if desc1 is None or desc2 is None:
            return [], 0

        if len(desc1) == 0 or len(desc2) == 0:
            return [], 0

        # k=2 returns the best and second-best candidate for the ratio test.
        matches = self.bf.knnMatch(desc1, desc2, k=2)

        good_matches = []
        for match_pair in matches:
            # Fewer than two candidates are returned when desc2 is tiny.
            if len(match_pair) != 2:
                continue
            best, runner_up = match_pair
            if best.distance < self.LOWE_RATIO * runner_up.distance:
                good_matches.append(best)

        # Every query descriptor contributes at most one good match, so the
        # count is bounded by len(desc1) but not by len(desc2); cap the result
        # so it stays a valid percentage when desc2 is the smaller set.
        smaller_set = min(len(desc1), len(desc2))
        similarity_ratio = min(100.0, len(good_matches) / smaller_set * 100)

        return good_matches, similarity_ratio

    def calculate_similarity_orb(self, img1, img2):
        """Compute the ORB similarity between two images.

        Args:
            img1: First image array.
            img2: Second image array.

        Returns:
            Tuple ``(similarity_percentage, details)``. ``details`` holds the
            keypoint counts, the number of good matches and the ratio, or an
            ``'error'`` entry when no descriptors could be computed.
        """
        kp1, desc1 = self.extract_features_orb(img1)
        kp2, desc2 = self.extract_features_orb(img2)

        if desc1 is None or desc2 is None:
            return 0, {'error': 'No features detected'}

        good_matches, similarity_ratio = self.match_features_bf(desc1, desc2)

        details = {
            'keypoints_img1': len(kp1) if kp1 else 0,
            'keypoints_img2': len(kp2) if kp2 else 0,
            'good_matches': len(good_matches),
            'similarity_ratio': similarity_ratio
        }

        return similarity_ratio, details

    def process_document(self, file_path):
        """Extract images and text from a document.

        Args:
            file_path: Path to a ``.pdf`` or ``.docx`` file.

        Returns:
            Dict with keys ``'images'``, ``'text'``, ``'file_path'`` and
            ``'num_images'``.

        Raises:
            ValueError: If the file suffix is neither ``.pdf`` nor ``.docx``.
        """
        ext = Path(file_path).suffix.lower()

        print(f"   Memproses {Path(file_path).name}...")

        if ext == '.pdf':
            images = self.extract_images_from_pdf(file_path)
        elif ext == '.docx':
            images = self.extract_images_from_docx(file_path)
        else:
            raise ValueError(f"Format file tidak didukung: {ext}")

        text = self.extract_text(file_path)

        print(f"   ✓ Diekstrak: {len(images)} gambar, {len(text)} karakter teks")

        return {
            'images': images,
            'text': text,
            'file_path': file_path,
            'num_images': len(images)
        }

    def detect_plagiarism(self, doc1_data, doc2_data):
        """Compare every image of document 1 against every image of document 2.

        Args:
            doc1_data: Dict with an ``'images'`` list (see process_document).
            doc2_data: Dict with an ``'images'`` list.

        Returns:
            Dict with ``'status'``, ``'percentage'`` (highest similarity over
            all pairs), ``'matches'`` (pairs at or above the threshold),
            ``'best_match'`` (highest reported pair or None) and the image
            counts of both documents.
        """
        images1 = doc1_data['images']
        images2 = doc2_data['images']

        if len(images1) == 0 or len(images2) == 0:
            return {
                'status': 'TIDAK ADA GAMBAR',
                'percentage': 0,
                'matches': [],
                'best_match': None,  # same shape as the normal result
                'total_images_doc1': len(images1),
                'total_images_doc2': len(images2)
            }

        results = []
        max_similarity = 0
        best_match = None

        print(f"\n   Membandingkan {len(images1)} vs {len(images2)} gambar...")

        for i, img1 in enumerate(images1):
            for j, img2 in enumerate(images2):
                similarity, details = self.calculate_similarity_orb(img1, img2)

                # Track the maximum over ALL pairs so that low, sub-threshold
                # similarity is still reflected in the status/percentage.
                max_similarity = max(max_similarity, similarity)

                # Inclusive comparison, consistent with the ">= 30" status rule.
                if similarity < self.similarity_threshold:
                    continue

                match_info = {
                    'gambar_dokumen1': i + 1,
                    'gambar_dokumen2': j + 1,
                    'similarity': similarity,
                    'details': details
                }
                results.append(match_info)

                if best_match is None or similarity > best_match['similarity']:
                    best_match = match_info

        # Derive the status from the highest similarity found.
        if max_similarity >= 70:
            status = "PLAGIARISME BERAT"
        elif max_similarity >= 50:
            status = "PLAGIARISME SEDANG"
        elif max_similarity >= 30:
            status = "TERINDIKASI PLAGIARISME"
        elif max_similarity > 0:
            status = "KEMIRIPAN RENDAH"
        else:
            status = "ORIGINAL"

        return {
            'status': status,
            'percentage': max_similarity,
            'matches': results,
            'best_match': best_match,
            'total_images_doc1': len(images1),
            'total_images_doc2': len(images2)
        }

    def visualize_matches(self, img1, img2, matches, kp1, kp2, output_path=None):
        """Draw the matches between two images side by side.

        Args:
            img1: First image.
            img2: Second image.
            matches: Matches to draw; only the first MAX_DRAWN_MATCHES are used.
            kp1: Keypoints of ``img1``.
            kp2: Keypoints of ``img2``.
            output_path: Optional file path to which the drawing is written.

        Returns:
            The image produced by ``cv2.drawMatches``.
        """
        img_matches = cv2.drawMatches(
            img1, kp1, img2, kp2, matches[:self.MAX_DRAWN_MATCHES], None,
            flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS
        )

        if output_path:
            cv2.imwrite(output_path, img_matches)

        return img_matches

    def print_detailed_results(self, doc1_path, doc2_path, result):
        """Print a detailed, human-readable detection report.

        Args:
            doc1_path: Path of the first document (only the basename is shown).
            doc2_path: Path of the second document.
            result: Result dict as returned by detect_plagiarism().
        """
        print("\n" + "="*70)
        print("HASIL DETEKSI PLAGIARISME GAMBAR (ORB + BRUTE-FORCE MATCHER)")
        print("="*70)
        print(f"Dokumen 1: {os.path.basename(doc1_path)}")
        print(f"Dokumen 2: {os.path.basename(doc2_path)}")
        print(f"Total gambar Dokumen 1: {result['total_images_doc1']}")
        print(f"Total gambar Dokumen 2: {result['total_images_doc2']}")
        print("-"*70)

        if result['matches']:
            print("\n📊 HASIL PERBANDINGAN GAMBAR:")
            for match in result['matches']:
                details = match['details']
                print(f"\n  🖼️  Gambar {match['gambar_dokumen1']} (Doc1) vs Gambar {match['gambar_dokumen2']} (Doc2)")
                print(f"     • Kemiripan: {match['similarity']:.2f}%")
                print(f"     • Keypoints terdeteksi: {details['keypoints_img1']} vs {details['keypoints_img2']}")
                print(f"     • Good matches (Lowe's test): {details['good_matches']}")

            # .get(): tolerate result dicts that carry no 'best_match' entry.
            best = result.get('best_match')
            if best:
                print(f"\n🎯 KEMIRIPAN TERTINGGI: {result['percentage']:.2f}%")
                print(f"   (Gambar {best['gambar_dokumen1']} ↔ Gambar {best['gambar_dokumen2']})")
        else:
            print("\n✅ Tidak ditemukan kemiripan gambar yang signifikan")

        print("-"*70)
        print(f"\n📋 STATUS AKHIR: {result['status']}")

        # Recommendation for the reported similarity level.
        if result['percentage'] >= 70:
            print("⚠️  PLAGIARISME BERAT: Gambar hampir identik atau dimodifikasi minimal")
            print("   Rekomendasi: Tolak dokumen atau minta revisi")
        elif result['percentage'] >= 50:
            print("⚠️  PLAGIARISME SEDANG: Kemiripan struktur visual signifikan")
            print("   Rekomendasi: Perlu investigasi lebih lanjut")
        elif result['percentage'] >= 30:
            print("⚠️  TERINDIKASI: Ada kemiripan yang perlu diverifikasi")
            print("   Rekomendasi: Cek manual gambar yang dimaksud")
        else:
            print("✅ ORIGINAL: Tidak ditemukan indikasi plagiarisme")
            print("   Rekomendasi: Dokumen aman untuk diproses")

        print("="*70)

        # Summary of the method; values come from the live configuration so
        # the report cannot drift from what was actually used.
        print("\n🔬 METODE YANG DIGUNAKAN:")
        print("   • ORB (Oriented FAST and Rotated BRIEF) untuk ekstraksi fitur")
        print("   • Brute-Force Matcher dengan Hamming distance")
        print(f"   • Lowe's ratio test ({self.LOWE_RATIO}) untuk filtering matches")
        print(f"   • Threshold kemiripan: {self.similarity_threshold}% keypoints match")
        print("="*70)

def main():
    """Run the interactive command-line workflow.

    Asks for two document paths, extracts their images, runs the ORB-based
    comparison, prints the report and optionally shows the best match in an
    OpenCV window. Errors raised during processing are reported together
    with a traceback instead of propagating.
    """
    print("="*70)
    print("SISTEM DETEKSI PLAGIARISME GAMBAR")
    print("METODE: ORB + BRUTE-FORCE MATCHER")
    print("="*70)
    print("Kelebihan metode ini:")
    print("  ✓ Robust terhadap rotasi, scaling, dan perubahan cahaya")
    print("  ✓ Mendeteksi gambar yang diedit/cropped")
    print("  ✓ Cepat dan efisien untuk dataset kecil")
    print("-"*70)

    # Read both paths; surrounding quotes (e.g. from drag-and-drop) are removed.
    doc1_path = input("\nMasukkan path dokumen 1: ").strip()
    doc2_path = input("Masukkan path dokumen 2: ").strip()

    doc1_path = doc1_path.strip('"').strip("'")
    doc2_path = doc2_path.strip('"').strip("'")

    # isfile() rather than exists(): a directory is not a usable document.
    for path in (doc1_path, doc2_path):
        if not os.path.isfile(path):
            print(f"❌ Error: File tidak ditemukan: {path}")
            return

    detector = PlagiarismDetectorORB()

    try:
        # Stage 1: content extraction
        print("\n📄 Tahap 1: Ekstraksi konten")
        print("-"*50)
        doc1_data = detector.process_document(doc1_path)
        doc2_data = detector.process_document(doc2_path)

        # Stage 2: image comparison
        print("\n🔍 Tahap 2: Deteksi plagiarisme gambar")
        print("-"*50)
        result = detector.detect_plagiarism(doc1_data, doc2_data)

        detector.print_detailed_results(doc1_path, doc2_path, result)

        # Offer a visualisation only when a reportable match exists.
        if result['matches'] and result['best_match']:
            print("\n📸 Apakah Anda ingin memvisualisasikan matches terbaik? (y/n): ", end="")
            # strip() so that an answer such as "y " is still accepted.
            visualize = input().strip().lower()

            if visualize == 'y':
                best = result['best_match']
                # Reported image numbers are 1-based.
                img1 = doc1_data['images'][best['gambar_dokumen1'] - 1]
                img2 = doc2_data['images'][best['gambar_dokumen2'] - 1]

                # Features are recomputed because the result only stores counts.
                kp1, desc1 = detector.extract_features_orb(img1)
                kp2, desc2 = detector.extract_features_orb(img2)
                good_matches, _ = detector.match_features_bf(desc1, desc2)

                vis_img = detector.visualize_matches(img1, img2, good_matches[:30], kp1, kp2)

                try:
                    cv2.imshow("ORB Matches - Best Match", vis_img)
                    cv2.waitKey(0)
                finally:
                    # Close the window even if waiting is interrupted.
                    cv2.destroyAllWindows()

    except Exception as e:
        print(f"\n❌ Terjadi kesalahan: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    # Dependency check: try to import each required module and collect the
    # pip package name of every one that is unavailable.
    # NOTE: the same modules are imported unconditionally at the top of this
    # file, so a missing package raises ImportError there before this check
    # is ever reached.
    required_packages = ['cv2', 'numpy', 'PyPDF2', 'docx', 'PIL']
    pip_names = ['opencv-python', 'numpy', 'PyPDF2', 'python-docx', 'Pillow']

    missing = []
    for module_name, pip_name in zip(required_packages, pip_names):
        try:
            __import__(module_name)
        except ImportError:
            missing.append(pip_name)

    if not missing:
        main()
    else:
        print("❌ Install package yang diperlukan:")
        print(f"pip install {' '.join(missing)}")