Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions packages/markitdown/setup.cfg
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@

[metadata]
name = openize-markitdown-python
version = 25.4.0
version = 25.5.0

author = Openize
author_email = packages@openize.com
description = A document converter for Word, PDF, Excel, and PowerPoint to Markdown.
long_description = file:README.md
long_description = file:README.md
long_description_content_type = text/markdown
license = MIT
license_files = LICENSE
Expand All @@ -24,14 +25,15 @@ classifiers =

[options]
package_dir =
= src
= src
packages = find_namespace:
python_requires = >=3.7
install_requires =
aspose-words>=23.0.0
aspose-cells-python>=23.0.0
aspose-slides>=23.0.0
openai>=1.0.0
anthropic>=3.0.0

[options.packages.find]
where = src
Expand Down
4 changes: 3 additions & 1 deletion packages/markitdown/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def install_if_missing(package, module_name=None):
dependencies = [
("aspose-cells-python", "asposecellspython"),
("aspose-words", "asposewords"),
("aspose-slides", "asposeslides")
("aspose-slides", "asposeslides"),
("openai", "openai"),
("anthropic", "anthropic"),
]

# Install missing dependencies before proceeding
Expand Down
28 changes: 21 additions & 7 deletions packages/markitdown/src/openize/markitdown/core.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,34 @@
import os

from processor import DocumentProcessor

from llm_strategy import LLMFactory, SaveLocally
import logging

class MarkItDown:
def __init__(self, output_dir):
def __init__(self, output_dir, llm_client_name=None):
self.output_dir = output_dir
self.llm_client_name = llm_client_name
self.llm_client = None

def convert_document(self, input_file, insert_into_llm=False):
if llm_client_name:
try:
self.llm_client = LLMFactory.get_llm(llm_client_name)
except ValueError as e:
logging.error(f"LLM client error: {e}")
self.llm_client = SaveLocally()
else:
self.llm_client = SaveLocally()

def convert_document(self, input_file):
"""Run the document conversion process."""
processor = DocumentProcessor(self.output_dir)
processor.process_document(input_file, insert_into_llm)
md_file = processor.process_document(input_file)

if md_file and self.llm_client:
self.llm_client.process(md_file)

def convert_directory(self, input_dir: str, insert_into_llm: bool = False):
def convert_directory(self, input_dir: str):
supported_exts = [".docx", ".pdf", ".xlsx", ".pptx"]
for filename in os.listdir(input_dir):
filepath = os.path.join(input_dir, filename)
if os.path.isfile(filepath) and os.path.splitext(filename)[1].lower() in supported_exts:
self.convert_document(filepath, insert_into_llm)
self.convert_document(filepath)
51 changes: 47 additions & 4 deletions packages/markitdown/src/openize/markitdown/llm_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
from abc import ABC, abstractmethod
import openai

# Placeholder for Claude SDK import
# from claude_sdk import ClaudeClient as ClaudeAPIClient

class LLMStrategy(ABC):
@abstractmethod
def process(self, md_file):
Expand All @@ -12,10 +15,10 @@ class SaveLocally(LLMStrategy):
def process(self, md_file):
logging.info(f"File saved locally: {md_file}")

class InsertIntoLLM(LLMStrategy):
class OpenAIClient(LLMStrategy):
def __init__(self):
self.api_key = os.getenv("OPENAI_API_KEY") # Read from environment
self.model = os.getenv("OPENAI_MODEL", "gpt-4") # Default model if not set
self.api_key = os.getenv("OPENAI_API_KEY")
self.model = os.getenv("OPENAI_MODEL", "gpt-4")

if not self.api_key:
raise ValueError("Missing OpenAI API key. Please set it in the environment.")
Expand All @@ -40,11 +43,51 @@ def process(self, md_file):
)

llm_response = response.choices[0].message.content
logging.info(f"LLM Response for {md_file}: {llm_response}")
logging.info(f"OpenAI Response for {md_file}: {llm_response}")

except FileNotFoundError:
logging.error(f"Markdown file not found: {md_file}")
except openai.OpenAIError as e:
logging.error(f"OpenAI API error while processing {md_file}: {e}")
except Exception as e:
logging.exception(f"Unexpected error processing {md_file}: {e}")

class ClaudeClient(LLMStrategy):
def __init__(self):
self.api_key = os.getenv("CLAUDE_API_KEY")
self.model = os.getenv("CLAUDE_MODEL", "claude-v1")

if not self.api_key:
raise ValueError("Missing Claude API key. Please set it in the environment.")

# Initialize Claude client here (replace with actual SDK code)
# self.client = ClaudeAPIClient(api_key=self.api_key)

def process(self, md_file):
try:
with open(md_file, "r", encoding="utf-8") as file:
content = file.read()

# Replace with actual Claude API call
# response = self.client.complete(prompt=content, model=self.model)

# Dummy placeholder response
response_text = f"Simulated Claude response for {md_file}"

logging.info(f"Claude Response for {md_file}: {response_text}")

except FileNotFoundError:
logging.error(f"Markdown file not found: {md_file}")
except Exception as e:
logging.exception(f"Unexpected error processing {md_file}: {e}")

class LLMFactory:
@staticmethod
def get_llm(client_name: str) -> LLMStrategy:
client_name = client_name.lower()
if client_name == "openai":
return OpenAIClient()
elif client_name == "claude":
return ClaudeClient()
else:
raise ValueError(f"Unknown LLM client: {client_name}")
24 changes: 14 additions & 10 deletions packages/markitdown/src/openize/markitdown/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from core import MarkItDown
from license_manager import LicenseManager


def ask_user_boolean(question):
"""Ask the user a yes/no question and return True/False."""
while True:
Expand All @@ -17,7 +16,6 @@ def ask_user_boolean(question):
else:
print("Invalid input. Please enter 'yes' or 'no'.")


def ensure_env_variable(var_name, prompt_message, default=None):
"""Ensure an environment variable is set, otherwise ask the user and persist it."""
value = os.getenv(var_name)
Expand All @@ -31,7 +29,6 @@ def ensure_env_variable(var_name, prompt_message, default=None):

return value


def set_env_variable(var_name, value):
"""Set an environment variable persistently on Windows and Linux/macOS."""
os.environ[var_name] = value # Set for the current session
Expand All @@ -42,7 +39,6 @@ def set_env_variable(var_name, value):
os.system(f'echo "export {var_name}={value}" >> ~/.bashrc')
os.system(f'echo "export {var_name}={value}" >> ~/.profile')


def main():
"""Entry point for the CLI tool."""
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
Expand All @@ -52,7 +48,8 @@ def main():
input_group.add_argument("--input-file", help="Path to the input document (PDF, Word, etc.)")
input_group.add_argument("--input-dir", help="Path to a directory containing supported documents")
parser.add_argument("-o", "--output-dir", required=True, help="Directory to save the converted Markdown file(s)")
parser.add_argument("--insert-into-llm", action="store_true", help="Insert output into LLM")
parser.add_argument("--llm", choices=["none", "openai", "claude"], default="none",
help="Choose LLM client to process output (none, openai, claude)")

args = parser.parse_args()

Expand All @@ -64,19 +61,26 @@ def main():
LicenseManager().apply_license()

# Setup LLM credentials only if required
if args.insert_into_llm:
if args.llm == "openai":
ensure_env_variable("OPENAI_API_KEY", "Enter your OpenAI API key: ")
ensure_env_variable("OPENAI_MODEL", "Enter OpenAI model name (default: gpt-4): ", default="gpt-4")
elif args.llm == "claude":
ensure_env_variable("CLAUDE_API_KEY", "Enter your Claude API key: ")
ensure_env_variable("CLAUDE_MODEL", "Enter Claude model name (default: claude-v1): ", default="claude-v1")

# Run conversion for either a single file or a directory
markitdown = MarkItDown(args.output_dir)
# Initialize MarkItDown with selected LLM
llm_client_name = args.llm if args.llm != "none" else None
markitdown = MarkItDown(args.output_dir, llm_client_name)

# Run conversion for either a single file or a directory
if args.input_file:
markitdown.convert_document(args.input_file, args.insert_into_llm)
markitdown.convert_document(args.input_file)
elif args.input_dir:
markitdown.convert_directory(args.input_dir, args.insert_into_llm)
markitdown.convert_directory(args.input_dir)

except Exception as e:
logging.error(f"Error: {e}", exc_info=True)
sys.exit(1)

if __name__ == "__main__":
main()
70 changes: 53 additions & 17 deletions packages/markitdown/tests/test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import pytest
from pathlib import Path
import os

from ..src.openize.markitdown.converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter
from ..src.openize.markitdown.factory import ConverterFactory
from ..src.openize.markitdown.llm_strategy import SaveLocally, InsertIntoLLM
from ..src.openize.markitdown.llm_strategy import SaveLocally, LLMFactory, OpenAIClient, ClaudeClient
from ..src.openize.markitdown.processor import DocumentProcessor
import os


@pytest.fixture
def sample_output_dir():
Expand All @@ -18,49 +20,83 @@ def sample_md_file(sample_output_dir):
md_file.write_text("# Sample Markdown File\n\nThis is a test.")
return md_file

# Test Converters
def test_word_converter(sample_output_dir):

# --------- Converter Tests ---------

def test_word_converter():
converter = WordConverter()
assert converter is not None

def test_pdf_converter(sample_output_dir):
def test_pdf_converter():
converter = PDFConverter()
assert converter is not None

def test_excel_converter(sample_output_dir):
def test_excel_converter():
converter = ExcelConverter()
assert converter is not None

def test_ppt_converter(sample_output_dir):
def test_ppt_converter():
converter = PowerPointConverter()
assert converter is not None

# Test ConverterFactory

# --------- Factory Tests ---------

def test_converter_factory():
assert isinstance(ConverterFactory.get_converter(".docx"), WordConverter)
assert isinstance(ConverterFactory.get_converter(".pdf"), PDFConverter)
assert isinstance(ConverterFactory.get_converter(".xlsx"), ExcelConverter)
assert isinstance(ConverterFactory.get_converter(".pptx"), PowerPointConverter)


# Test LLM Strategy
# --------- Strategy Pattern Tests ---------

def test_save_locally(sample_md_file):
strategy = SaveLocally()
strategy.process(sample_md_file)
assert sample_md_file.exists()

def test_insert_into_llm(mocker, sample_md_file):
mocker.patch("openai.ChatCompletion.create", return_value={"choices": [{"message": {"content": "LLM Response"}}]})
strategy = InsertIntoLLM()
def test_insert_into_llm_openai(mocker, sample_md_file):
mocker.patch("openai.ChatCompletion.create", return_value={
"choices": [{"message": {"content": "Mocked OpenAI Response"}}]
})
strategy = OpenAIClient(provider="openai")
strategy.process(sample_md_file)

def test_insert_into_llm_claude(mocker, sample_md_file):
mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
mock_client = mock_anthropic.return_value
mock_client.messages.create.return_value.content = "Mocked Claude Response"
strategy = ClaudeClient(provider="claude")
strategy.process(sample_md_file)

# Test DocumentProcessor
def test_document_processor(mocker, sample_output_dir):
mocker.patch("packages.src.openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())

# --------- Document Processor Tests ---------

def test_document_processor_local_conversion(mocker, sample_output_dir):
mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
processor = DocumentProcessor(output_dir=sample_output_dir)
processor.process_document("sample.docx", insert_into_llm=False)
output_file = sample_output_dir / "sample.md"
assert output_file.exists()

if __name__ == "__main__":
pytest.main()
def test_document_processor_with_llm_openai(mocker, sample_output_dir):
mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
mocker.patch("openai.ChatCompletion.create", return_value={
"choices": [{"message": {"content": "LLM Output"}}]
})
processor = DocumentProcessor(output_dir=sample_output_dir)
processor.process_document("sample.docx", insert_into_llm=True, llm_provider="openai")
output_file = sample_output_dir / "sample.md"
assert output_file.exists()

def test_document_processor_with_llm_claude(mocker, sample_output_dir):
mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
mock_client = mock_anthropic.return_value
mock_client.messages.create.return_value.content = "LLM Claude Output"
processor = DocumentProcessor(output_dir=sample_output_dir)
processor.process_document("sample.docx", insert_into_llm=True, llm_provider="claude")
output_file = sample_output_dir / "sample.md"
assert output_file.exists()