diff --git a/packages/markitdown/setup.cfg b/packages/markitdown/setup.cfg index 8d2203a..b7ab0be 100644 --- a/packages/markitdown/setup.cfg +++ b/packages/markitdown/setup.cfg @@ -1,11 +1,12 @@ [metadata] name = openize-markitdown-python -version = 25.4.0 +version = 25.5.0 + author = Openize author_email = packages@openize.com description = A document converter for Word, PDF, Excel, and PowerPoint to Markdown. -long_description = file:README.md +long_description = file:README.md long_description_content_type = text/markdown license = MIT license_files = LICENSE @@ -24,7 +25,7 @@ classifiers = [options] package_dir = - = src + = src packages = find_namespace: python_requires = >=3.7 install_requires = @@ -32,6 +33,7 @@ install_requires = aspose-cells-python>=23.0.0 aspose-slides>=23.0.0 openai>=1.0.0 + anthropic>=3.0.0 [options.packages.find] where = src diff --git a/packages/markitdown/setup.py b/packages/markitdown/setup.py index e13c461..ed3e168 100644 --- a/packages/markitdown/setup.py +++ b/packages/markitdown/setup.py @@ -18,7 +18,9 @@ def install_if_missing(package, module_name=None): dependencies = [ ("aspose-cells-python", "asposecellspython"), ("aspose-words", "asposewords"), - ("aspose-slides", "asposeslides") + ("aspose-slides", "asposeslides"), + ("openai", "openai"), + ("anthropic", "anthropic"), ] # Install missing dependencies before proceeding diff --git a/packages/markitdown/src/openize/markitdown/core.py b/packages/markitdown/src/openize/markitdown/core.py index 2b9213f..b1b24a9 100644 --- a/packages/markitdown/src/openize/markitdown/core.py +++ b/packages/markitdown/src/openize/markitdown/core.py @@ -1,20 +1,34 @@ import os - from processor import DocumentProcessor - +from llm_strategy import LLMFactory, SaveLocally +import logging class MarkItDown: - def __init__(self, output_dir): + def __init__(self, output_dir, llm_client_name=None): self.output_dir = output_dir + self.llm_client_name = llm_client_name + self.llm_client = None - def 
convert_document(self, input_file, insert_into_llm=False): + if llm_client_name: + try: + self.llm_client = LLMFactory.get_llm(llm_client_name) + except ValueError as e: + logging.error(f"LLM client error: {e}") + self.llm_client = SaveLocally() + else: + self.llm_client = SaveLocally() + + def convert_document(self, input_file): """Run the document conversion process.""" processor = DocumentProcessor(self.output_dir) - processor.process_document(input_file, insert_into_llm) + md_file = processor.process_document(input_file) + + if md_file and self.llm_client: + self.llm_client.process(md_file) - def convert_directory(self, input_dir: str, insert_into_llm: bool = False): + def convert_directory(self, input_dir: str): supported_exts = [".docx", ".pdf", ".xlsx", ".pptx"] for filename in os.listdir(input_dir): filepath = os.path.join(input_dir, filename) if os.path.isfile(filepath) and os.path.splitext(filename)[1].lower() in supported_exts: - self.convert_document(filepath, insert_into_llm) + self.convert_document(filepath) diff --git a/packages/markitdown/src/openize/markitdown/llm_strategy.py b/packages/markitdown/src/openize/markitdown/llm_strategy.py index bc101cf..f9c7ee4 100644 --- a/packages/markitdown/src/openize/markitdown/llm_strategy.py +++ b/packages/markitdown/src/openize/markitdown/llm_strategy.py @@ -3,6 +3,9 @@ from abc import ABC, abstractmethod import openai +# Placeholder for Claude SDK import +# from claude_sdk import ClaudeClient as ClaudeAPIClient + class LLMStrategy(ABC): @abstractmethod def process(self, md_file): @@ -12,10 +15,10 @@ class SaveLocally(LLMStrategy): def process(self, md_file): logging.info(f"File saved locally: {md_file}") -class InsertIntoLLM(LLMStrategy): +class OpenAIClient(LLMStrategy): def __init__(self): - self.api_key = os.getenv("OPENAI_API_KEY") # Read from environment - self.model = os.getenv("OPENAI_MODEL", "gpt-4") # Default model if not set + self.api_key = os.getenv("OPENAI_API_KEY") + self.model = 
os.getenv("OPENAI_MODEL", "gpt-4") if not self.api_key: raise ValueError("Missing OpenAI API key. Please set it in the environment.") @@ -40,7 +43,7 @@ def process(self, md_file): ) llm_response = response.choices[0].message.content - logging.info(f"LLM Response for {md_file}: {llm_response}") + logging.info(f"OpenAI Response for {md_file}: {llm_response}") except FileNotFoundError: logging.error(f"Markdown file not found: {md_file}") @@ -48,3 +51,43 @@ def process(self, md_file): logging.error(f"OpenAI API error while processing {md_file}: {e}") except Exception as e: logging.exception(f"Unexpected error processing {md_file}: {e}") + +class ClaudeClient(LLMStrategy): + def __init__(self): + self.api_key = os.getenv("CLAUDE_API_KEY") + self.model = os.getenv("CLAUDE_MODEL", "claude-v1") + + if not self.api_key: + raise ValueError("Missing Claude API key. Please set it in the environment.") + + # Initialize Claude client here (replace with actual SDK code) + # self.client = ClaudeAPIClient(api_key=self.api_key) + + def process(self, md_file): + try: + with open(md_file, "r", encoding="utf-8") as file: + content = file.read() + + # Replace with actual Claude API call + # response = self.client.complete(prompt=content, model=self.model) + + # Dummy placeholder response + response_text = f"Simulated Claude response for {md_file}" + + logging.info(f"Claude Response for {md_file}: {response_text}") + + except FileNotFoundError: + logging.error(f"Markdown file not found: {md_file}") + except Exception as e: + logging.exception(f"Unexpected error processing {md_file}: {e}") + +class LLMFactory: + @staticmethod + def get_llm(client_name: str) -> LLMStrategy: + client_name = client_name.lower() + if client_name == "openai": + return OpenAIClient() + elif client_name == "claude": + return ClaudeClient() + else: + raise ValueError(f"Unknown LLM client: {client_name}") diff --git a/packages/markitdown/src/openize/markitdown/main.py 
b/packages/markitdown/src/openize/markitdown/main.py index 0c2fe80..14bc3c4 100644 --- a/packages/markitdown/src/openize/markitdown/main.py +++ b/packages/markitdown/src/openize/markitdown/main.py @@ -5,7 +5,6 @@ from core import MarkItDown from license_manager import LicenseManager - def ask_user_boolean(question): """Ask the user a yes/no question and return True/False.""" while True: @@ -17,7 +16,6 @@ def ask_user_boolean(question): else: print("Invalid input. Please enter 'yes' or 'no'.") - def ensure_env_variable(var_name, prompt_message, default=None): """Ensure an environment variable is set, otherwise ask the user and persist it.""" value = os.getenv(var_name) @@ -31,7 +29,6 @@ def ensure_env_variable(var_name, prompt_message, default=None): return value - def set_env_variable(var_name, value): """Set an environment variable persistently on Windows and Linux/macOS.""" os.environ[var_name] = value # Set for the current session @@ -42,7 +39,6 @@ def set_env_variable(var_name, value): os.system(f'echo "export {var_name}={value}" >> ~/.bashrc') os.system(f'echo "export {var_name}={value}" >> ~/.profile') - def main(): """Entry point for the CLI tool.""" logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") @@ -52,7 +48,8 @@ def main(): input_group.add_argument("--input-file", help="Path to the input document (PDF, Word, etc.)") input_group.add_argument("--input-dir", help="Path to a directory containing supported documents") parser.add_argument("-o", "--output-dir", required=True, help="Directory to save the converted Markdown file(s)") - parser.add_argument("--insert-into-llm", action="store_true", help="Insert output into LLM") + parser.add_argument("--llm", choices=["none", "openai", "claude"], default="none", + help="Choose LLM client to process output (none, openai, claude)") args = parser.parse_args() @@ -64,19 +61,26 @@ def main(): LicenseManager().apply_license() # Setup LLM credentials only if required - if 
args.insert_into_llm: + if args.llm == "openai": ensure_env_variable("OPENAI_API_KEY", "Enter your OpenAI API key: ") ensure_env_variable("OPENAI_MODEL", "Enter OpenAI model name (default: gpt-4): ", default="gpt-4") + elif args.llm == "claude": + ensure_env_variable("CLAUDE_API_KEY", "Enter your Claude API key: ") + ensure_env_variable("CLAUDE_MODEL", "Enter Claude model name (default: claude-v1): ", default="claude-v1") - # Run conversion for either a single file or a directory - markitdown = MarkItDown(args.output_dir) + # Initialize MarkItDown with selected LLM + llm_client_name = args.llm if args.llm != "none" else None + markitdown = MarkItDown(args.output_dir, llm_client_name) + # Run conversion for either a single file or a directory if args.input_file: - markitdown.convert_document(args.input_file, args.insert_into_llm) + markitdown.convert_document(args.input_file) elif args.input_dir: - markitdown.convert_directory(args.input_dir, args.insert_into_llm) + markitdown.convert_directory(args.input_dir) except Exception as e: logging.error(f"Error: {e}", exc_info=True) sys.exit(1) +if __name__ == "__main__": + main() diff --git a/packages/markitdown/tests/test.py b/packages/markitdown/tests/test.py index db6c4c8..601b67e 100644 --- a/packages/markitdown/tests/test.py +++ b/packages/markitdown/tests/test.py @@ -1,10 +1,12 @@ import pytest from pathlib import Path +import os + from ..src.openize.markitdown.converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter from ..src.openize.markitdown.factory import ConverterFactory -from ..src.openize.markitdown.llm_strategy import SaveLocally, InsertIntoLLM +from ..src.openize.markitdown.llm_strategy import SaveLocally, LLMFactory, OpenAIClient, ClaudeClient from ..src.openize.markitdown.processor import DocumentProcessor -import os + @pytest.fixture def sample_output_dir(): @@ -18,24 +20,28 @@ def sample_md_file(sample_output_dir): md_file.write_text("# Sample Markdown File\n\nThis is a 
test.") return md_file -# Test Converters -def test_word_converter(sample_output_dir): + +# --------- Converter Tests --------- + +def test_word_converter(): converter = WordConverter() assert converter is not None -def test_pdf_converter(sample_output_dir): +def test_pdf_converter(): converter = PDFConverter() assert converter is not None -def test_excel_converter(sample_output_dir): +def test_excel_converter(): converter = ExcelConverter() assert converter is not None -def test_ppt_converter(sample_output_dir): +def test_ppt_converter(): converter = PowerPointConverter() assert converter is not None -# Test ConverterFactory + +# --------- Factory Tests --------- + def test_converter_factory(): assert isinstance(ConverterFactory.get_converter(".docx"), WordConverter) assert isinstance(ConverterFactory.get_converter(".pdf"), PDFConverter) @@ -43,24 +49,54 @@ assert isinstance(ConverterFactory.get_converter(".pptx"), PowerPointConverter) -# Test LLM Strategy +# --------- Strategy Pattern Tests --------- + def test_save_locally(sample_md_file): strategy = SaveLocally() strategy.process(sample_md_file) assert sample_md_file.exists() -def test_insert_into_llm(mocker, sample_md_file): - mocker.patch("openai.ChatCompletion.create", return_value={"choices": [{"message": {"content": "LLM Response"}}]}) - strategy = InsertIntoLLM() +def test_insert_into_llm_openai(mocker, sample_md_file): + mocker.patch("openai.ChatCompletion.create", return_value={ + "choices": [{"message": {"content": "Mocked OpenAI Response"}}] + }) + strategy = OpenAIClient() + strategy.process(sample_md_file) + +def test_insert_into_llm_claude(mocker, sample_md_file): + mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic", create=True) + mock_client = mock_anthropic.return_value + mock_client.messages.create.return_value.content = "Mocked Claude Response" + strategy = ClaudeClient() strategy.process(sample_md_file) -# Test 
DocumentProcessor -def test_document_processor(mocker, sample_output_dir): - mocker.patch("packages.src.openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter()) + +# --------- Document Processor Tests --------- + +def test_document_processor_local_conversion(mocker, sample_output_dir): + mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter()) processor = DocumentProcessor(output_dir=sample_output_dir) processor.process_document("sample.docx", insert_into_llm=False) output_file = sample_output_dir / "sample.md" assert output_file.exists() -if __name__ == "__main__": - pytest.main() +def test_document_processor_with_llm_openai(mocker, sample_output_dir): + mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter()) + mocker.patch("openai.ChatCompletion.create", return_value={ + "choices": [{"message": {"content": "LLM Output"}}] + }) + processor = DocumentProcessor(output_dir=sample_output_dir) + processor.process_document("sample.docx", insert_into_llm=True) + output_file = sample_output_dir / "sample.md" + assert output_file.exists() + +def test_document_processor_with_llm_claude(mocker, sample_output_dir): + mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter()) + mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic", create=True) + mock_client = mock_anthropic.return_value + mock_client.messages.create.return_value.content = "LLM Claude Output" + processor = DocumentProcessor(output_dir=sample_output_dir) + processor.process_document("sample.docx", insert_into_llm=True) + output_file = sample_output_dir / "sample.md" + assert output_file.exists() +