Macfa
diff --git a/‎expose/README.md‎
Lines changed: 107 additions & 0 deletions b/‎expose/README.md‎
Lines changed: 107 additions & 0 deletions
diff --git a/‎expose/content_extractors/base_extractor.py‎
Lines changed: 156 additions & 0 deletions b/‎expose/content_extractors/base_extractor.py‎
Lines changed: 156 additions & 0 deletions
diff --git a/‎expose/content_extractors/extractor_factory.py‎
Lines changed: 31 additions & 0 deletions b/‎expose/content_extractors/extractor_factory.py‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎expose/content_extractors/hwp_extractor.py‎
Lines changed: 76 additions & 0 deletions b/‎expose/content_extractors/hwp_extractor.py‎
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,107 @@
+# 문서 핵심 데이터 추출 시스템
+
+다양한 문서 형식(PDF, Word, HWP)에서 핵심 내용을 추출하고 구조화된 형식으로 출력하는 시스템입니다.
+
+## 현재 구현 상태
+
+### 1. 파일 읽기 모듈
+- PDF 파일 읽기 (`PDFReader`)
+- Word 문서 읽기 (`WordReader`)
+- 한글 문서 읽기 (`HWPReader`)
+
+### 2. 파일 내보내기 모듈
+- Excel 파일 내보내기 (`ExcelWriter`)
+- PDF 파일 내보내기 (`PDFWriter`)
+
+### 3. 핵심 내용 추출 모듈
+- 기본 추출기 (`BaseExtractor`)
+  - TextRank 알고리즘 (점수 계산 로직은 아직 플레이스홀더 상태)
+  - BERT 기반 의미적 분석
+  - 위치 기반 가중치
+  - 도메인 특화 점수
+- PDF 특화 추출기 (`PDFExtractor`)
+  - 페이지 번호 제거
+  - 헤더/푸터 제거
+  - 표 구조 분석
+- Word 특화 추출기 (`WordExtractor`)
+  - 스타일 태그 처리
+  - 특수 문자 정규화
+  - 목차 분석
+- 한글 특화 추출기 (`HWPExtractor`)
+  - 한글 특수 문자 처리
+  - 각주/미주 제거
+  - 단락 구조 분석
+
+### 4. 추출기 팩토리
+- 파일 확장자에 따른 적절한 추출기 선택
+- 확장 가능한 구조
+
+## 남은 작업
+
+### 1. OCR 통합
+- [ ] Tesseract OCR을 활용한 스캔본 문서 처리
+- [ ] 이미지 기반 텍스트 추출 최적화
+- [ ] 다국어 OCR 지원
+
+### 2. 성능 최적화
+- [ ] 병렬 처리 구현
+- [ ] 캐싱 시스템 도입
+- [ ] 메모리 사용량 최적화
+
+### 3. 사용자 경험 개선
+- [ ] 사용자 정의 패턴 등록 기능
+- [ ] 추출 결과 실시간 미리보기
+- [ ] 사용자 피드백 기반 학습
+
+### 4. 기능 확장
+- [ ] 실시간 협업 기능
+- [ ] 다국어 지원 강화
+- [ ] 도메인 특화 모델 추가
+
+## 향후 방향성
+
+### 1. AI/ML 기반 고도화
+- 딥러닝 모델을 활용한 더 정교한 내용 추출
+- 사용자 피드백 기반 지속적 학습
+- 도메인 특화 모델 개발
+
+### 2. 확장성 강화
+- 새로운 문서 형식 지원
+- 클라우드 기반 확장
+- 마이크로서비스 아키텍처 도입
+
+### 3. 보안 강화
+- 문서 암호화 지원
+- 접근 제어 구현
+- 감사 로그 시스템 구축
+
+### 4. 통합성 향상
+- API 서비스 제공
+- 다양한 플랫폼 연동
+- 표준 형식 지원 확대
+
+## 설치 및 사용 방법
+
+1. 의존성 설치:
+```bash
+pip install -r requirements.txt
+```
+
+2. 기본 사용 예시:
+```python
+from document_processor import DocumentProcessor
+
+processor = DocumentProcessor()
+extracted_data = processor.process_file("input.pdf")
+processor.export_file(extracted_data, "output.xlsx")
+```
+
+## 라이선스
+MIT License
+
+## 기여 방법
+1. Fork the repository
+2. Create your feature branch
+3. Commit your changes
+4. Push to the branch
+5. Create a new Pull Request 
@@ -0,0 +1,156 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any
+import re
+from sentence_transformers import SentenceTransformer
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import BertModel, BertTokenizer
+import torch
+
+class BaseExtractor(ABC):
+    def __init__(self):
+        # 다국어 BERT 모델 초기화
+        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
+        self.bert_model = BertModel.from_pretrained('klue/bert-base')
+        self.tokenizer = BertTokenizer.from_pretrained('klue/bert-base')
+        
+        # 도메인별 전문 용어 사전
+        self.domain_keywords = {
+            'legal': ['제1조', '당사자', '계약기간', '의무', '권리'],
+            'medical': ['투여량', '부작용', '진단코드', '처방', '증상'],
+            'technical': ['사양', '구성', '설치', '운영', '관리']
+        }
+    
+    @abstractmethod
+    def preprocess_text(self, text: str) -> str:
+        """문서 타입별 전처리 메서드"""
+        pass
+    
+    def calculate_sentence_importance(self, sentence: str, position: int, total_sentences: int) -> float:
+        """
+        문장의 중요도를 계산합니다.
+        
+        Args:
+            sentence (str): 분석할 문장
+            position (int): 문장의 위치
+            total_sentences (int): 전체 문장 수
+            
+        Returns:
+            float: 문장 중요도 점수
+        """
+        # TextRank 점수
+        textrank_score = self._calculate_textrank_score(sentence)
+        
+        # BERT 임베딩 기반 의미적 중요도
+        semantic_score = self._calculate_semantic_score(sentence)
+        
+        # 위치 기반 가중치 (문서의 시작과 끝 부분에 더 높은 가중치)
+        position_weight = self._calculate_position_weight(position, total_sentences)
+        
+        # 도메인 특화 점수
+        domain_score = self._calculate_domain_score(sentence)
+        
+        # 종합 점수 계산 (가중치: TextRank 0.3, BERT 0.3, 위치 0.2, 도메인 0.2)
+        total_score = (textrank_score * 0.3 + 
+                      semantic_score * 0.3 + 
+                      position_weight * 0.2 + 
+                      domain_score * 0.2)
+        
+        return total_score
+    
+    def _calculate_textrank_score(self, sentence: str) -> float:
+        """TextRank 알고리즘을 적용한 중요도 점수 계산"""
+        # 구현은 기존 코드와 동일
+        pass
+    
+    def _calculate_semantic_score(self, sentence: str) -> float:
+        """BERT 기반 의미적 중요도 점수 계산"""
+        inputs = self.tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
+        with torch.no_grad():
+            outputs = self.bert_model(**inputs)
+        
+        # [CLS] 토큰의 임베딩을 문장 표현으로 사용
+        sentence_embedding = outputs.last_hidden_state[:, 0, :].numpy()
+        
+        # 의미적 중요도 점수 계산 (예: 다른 문장들과의 평균 유사도)
+        return np.mean(cosine_similarity(sentence_embedding, self.sentence_embeddings))
+    
+    def _calculate_position_weight(self, position: int, total: int) -> float:
+        """문장 위치 기반 가중치 계산"""
+        # 문서의 시작과 끝 부분에 더 높은 가중치
+        normalized_position = position / total
+        if normalized_position < 0.1 or normalized_position > 0.9:
+            return 1.0
+        elif normalized_position < 0.2 or normalized_position > 0.8:
+            return 0.8
+        else:
+            return 0.5
+    
+    def _calculate_domain_score(self, sentence: str) -> float:
+        """도메인 특화 점수 계산"""
+        max_score = 0.0
+        for domain, keywords in self.domain_keywords.items():
+            score = sum(1 for keyword in keywords if keyword in sentence)
+            max_score = max(max_score, score / len(keywords))
+        return max_score
+    
+    def extract_key_sentences(self, text: str, top_n: int = 5) -> List[str]:
+        """
+        텍스트에서 핵심 문장을 추출합니다.
+        
+        Args:
+            text (str): 원본 텍스트
+            top_n (int): 추출할 문장 수
+            
+        Returns:
+            List[str]: 핵심 문장 리스트
+        """
+        # 문장 분리
+        sentences = re.split(r'[.!?]+', text)
+        sentences = [s.strip() for s in sentences if len(s.strip()) > 0]
+        
+        # 문장 임베딩 생성
+        self.sentence_embeddings = self.model.encode(sentences)
+        
+        # 각 문장의 중요도 계산
+        sentence_scores = []
+        for i, sentence in enumerate(sentences):
+            score = self.calculate_sentence_importance(sentence, i, len(sentences))
+            sentence_scores.append((sentence, score))
+        
+        # 중요도 순으로 정렬
+        ranked_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
+        return [sentence for sentence, _ in ranked_sentences[:top_n]]
+    
+    def extract_keywords(self, text: str, top_n: int = 10) -> Dict[str, float]:
+        """
+        텍스트에서 키워드를 추출합니다.
+        
+        Args:
+            text (str): 원본 텍스트
+            top_n (int): 추출할 키워드 수
+            
+        Returns:
+            Dict[str, float]: 키워드와 중요도 점수
+        """
+        # 단어 분리 및 정규화
+        words = re.findall(r'\w+', text.lower())
+        word_freq = {}
+        
+        # 단어 빈도수 계산
+        for word in words:
+            if len(word) > 1:  # 1글자 단어 제외
+                word_freq[word] = word_freq.get(word, 0) + 1
+        
+        # TF-IDF 스타일 점수 계산
+        max_freq = max(word_freq.values())
+        keywords = {word: freq/max_freq for word, freq in word_freq.items()}
+        
+        # 도메인 특화 키워드 가중치 적용
+        for word in keywords:
+            for domain_keywords in self.domain_keywords.values():
+                if word in domain_keywords:
+                    keywords[word] *= 1.5  # 도메인 키워드 가중치 증가
+        
+        # 상위 키워드 선택
+        return dict(sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:top_n]) 
@@ -0,0 +1,31 @@
+from typing import Type
+from .base_extractor import BaseExtractor
+from .pdf_extractor import PDFExtractor
+from .word_extractor import WordExtractor
+from .hwp_extractor import HWPExtractor
+
+class ExtractorFactory:
+    @staticmethod
+    def get_extractor(file_extension: str) -> Type[BaseExtractor]:
+        """
+        파일 확장자에 따라 적절한 추출기를 반환합니다.
+        
+        Args:
+            file_extension (str): 파일 확장자 (예: '.pdf', '.docx', '.hwp')
+            
+        Returns:
+            Type[BaseExtractor]: 해당 파일 형식에 맞는 추출기 클래스
+            
+        Raises:
+            ValueError: 지원하지 않는 파일 형식인 경우
+        """
+        extractors = {
+            '.pdf': PDFExtractor,
+            '.docx': WordExtractor,
+            '.hwp': HWPExtractor
+        }
+        
+        if file_extension.lower() not in extractors:
+            raise ValueError(f"지원하지 않는 파일 형식입니다: {file_extension}")
+        
+        return extractors[file_extension.lower()] 
@@ -0,0 +1,76 @@
+from typing import List, Dict, Any
+import re
+from .base_extractor import BaseExtractor
+
+class HWPExtractor(BaseExtractor):
+    def preprocess_text(self, text: str) -> str:
+        """
+        한글 문서 텍스트 전처리
+        - 한글 특수 문자 처리
+        - 표 내용 정규화
+        - 각주/미주 제거
+        """
+        # 한글 특수 문자 처리
+        text = re.sub(r'[\u1100-\u11FF\u3130-\u318F\uA960-\uA97F\uD7B0-\uD7FF]', '', text)  # 한글 자모 제거
+        text = re.sub(r'[\uFF00-\uFFEF]', '', text)  # 반각 문자 제거
+        
+        # 각주/미주 패턴 제거
+        text = re.sub(r'\[각주\d+\].*?\[/각주\]', '', text)
+        text = re.sub(r'\[미주\d+\].*?\[/미주\]', '', text)
+        
+        # 표 내용 정규화
+        text = re.sub(r'┌[─┬┐]+┐', '', text)  # 표 상단 테두리 제거
+        text = re.sub(r'├[─┼┤]+┤', '', text)  # 표 중간 테두리 제거
+        text = re.sub(r'└[─┴┘]+┘', '', text)  # 표 하단 테두리 제거
+        text = re.sub(r'│', '|', text)  # 세로선 정규화
+        
+        return text.strip()
+    
+    def extract_structure(self, text: str) -> Dict[str, Any]:
+        """
+        한글 문서의 구조를 추출합니다.
+        - 한글 스타일 기반 제목 식별
+        - 표 내용 추출
+        - 단락 구조 분석
+        """
+        structure = {
+            'headings': [],
+            'tables': [],
+            'paragraphs': [],
+            'content': []
+        }
+        
+        lines = text.split('\n')
+        current_paragraph = []
+        
+        for line in lines:
+            # 제목 패턴 (한글 제목 스타일)
+            if re.match(r'^[가-힣]+\s*[0-9]*\.?\s*$', line.strip()):
+                if current_paragraph:
+                    structure['paragraphs'].append('\n'.join(current_paragraph))
+                    current_paragraph = []
+                structure['headings'].append(line.strip())
+            
+            # 표 내용 (|로 구분된 경우)
+            elif '|' in line:
+                structure['tables'].append(line.strip())
+            
+            # 단락 구분 (빈 줄)
+            elif not line.strip():
+                if current_paragraph:
+                    structure['paragraphs'].append('\n'.join(current_paragraph))
+                    current_paragraph = []
+            
+            # 일반 텍스트
+            else:
+                current_paragraph.append(line)
+        
+        if current_paragraph:
+            structure['paragraphs'].append('\n'.join(current_paragraph))
+        
+        # 내용 추출 (단락을 문장 단위로 분리)
+        for para in structure['paragraphs']:
+            sentences = re.split(r'[.!?]+', para)
+            structure['content'].extend([s.strip() for s in sentences if s.strip()])
+        
+        return structure