ingest_files

Usage

from morphik import Morphik

db = Morphik()

# Inputs for the batch: three files that all receive the same metadata dict
batch_files = ["document1.pdf", "document2.docx", "image.png"]
shared_metadata = {"category": "reports"}

result = db.ingest_files(
    files=batch_files,
    metadata=shared_metadata,
    use_colpali=True,
    parallel=True,
)

# result["documents"] lists the documents that were ingested
for doc in result["documents"]:
    print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")

# result["errors"] lists the failures; each entry is read via its
# 'filename' and 'error' keys
for error in result["errors"]:
    print(f"Error ingesting {error.get('filename')}: {error.get('error')}")

from morphik import Morphik

db = Morphik()

# Inputs for the batch: three files that all receive the same metadata dict
batch_files = ["document1.pdf", "document2.docx", "image.png"]
shared_metadata = {"category": "reports"}

result = db.ingest_files(
    files=batch_files,
    metadata=shared_metadata,
    use_colpali=True,
    parallel=True,
)

# result["documents"] lists the documents that were ingested
for doc in result["documents"]:
    print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")

# result["errors"] lists the failures; each entry is read via its
# 'filename' and 'error' keys
for error in result["errors"]:
    print(f"Error ingesting {error.get('filename')}: {error.get('error')}")

from morphik import AsyncMorphik

async with AsyncMorphik() as db:
    # Batch ingest files with shared metadata
    result = await db.ingest_files(
        files=["document1.pdf", "document2.docx", "image.png"], 
        metadata={"category": "reports"},
        use_colpali=True,
        parallel=True
    )
    
    # Process the results
    for doc in result["documents"]:
        print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")
    
    # Check for errors
    for error in result["errors"]:
        print(f"Error ingesting {error.get('filename')}: {error.get('error')}")

Parameters

files (List[Union[str, bytes, BinaryIO, Path]]): List of files to ingest (path strings, bytes, file objects, or Path objects)
metadata (Dict[str, Any] | List[Dict[str, Any]], optional): Metadata to apply to the files. Can be either:
- A single dict to apply to all files
- A list of dicts, one per file (must match the length of files)
rules (List, optional): Rules to apply during ingestion. Can be either:
- A single list of rules to apply to all files
- A list of rule lists, one per file
use_colpali (bool, optional): Whether to use the ColPali-style embedding model. Defaults to True.
parallel (bool, optional): Whether to process files in parallel. Defaults to True.

Returns

A dictionary with two keys:

documents: List of successfully ingested Document objects
errors: List of errors encountered during ingestion (each error is a dict with 'filename' and 'error' keys)

Advanced Examples

Per-File Metadata

# Ingest files with different metadata for each file.
# Declaring each file next to its metadata keeps the two lists the same
# length and in the same order.
file_metadata_pairs = [
    ("report.pdf", {"category": "reports", "author": "Alice"}),
    ("data.csv", {"category": "data", "source": "database"}),
    ("presentation.pptx", {"category": "presentations", "department": "marketing"}),
]

# Split the pairs into the two parallel lists that ingest_files receives
files = [name for name, _ in file_metadata_pairs]
metadata_list = [meta for _, meta in file_metadata_pairs]

result = db.ingest_files(files=files, metadata=metadata_list)

# Ingest files with different metadata for each file.
# Declaring each file next to its metadata keeps the two lists the same
# length and in the same order.
file_metadata_pairs = [
    ("report.pdf", {"category": "reports", "author": "Alice"}),
    ("data.csv", {"category": "data", "source": "database"}),
    ("presentation.pptx", {"category": "presentations", "department": "marketing"}),
]

# Split the pairs into the two parallel lists that ingest_files receives
files = [name for name, _ in file_metadata_pairs]
metadata_list = [meta for _, meta in file_metadata_pairs]

result = db.ingest_files(files=files, metadata=metadata_list)

# Ingest files with different metadata for each file
files = ["report.pdf", "data.csv", "presentation.pptx"]

# Metadata must match the length of files list
metadata_list = [
    {"category": "reports", "author": "Alice"},
    {"category": "data", "source": "database"},
    {"category": "presentations", "department": "marketing"}
]

result = await db.ingest_files(
    files=files,
    metadata=metadata_list
)

Per-File Rules

from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
from pydantic import BaseModel

class ReportMetadata(BaseModel):
    # Schema handed to MetadataExtractionRule for the first file
    title: str
    author: str


class DataMetadata(BaseModel):
    # Schema handed to MetadataExtractionRule for the second file
    rows: int
    columns: int


files = ["report.pdf", "data.csv", "presentation.pptx"]

# One rule list per file, given in the same order as `files`
report_rules = [MetadataExtractionRule(schema=ReportMetadata)]
data_rules = [MetadataExtractionRule(schema=DataMetadata)]
presentation_rules = [NaturalLanguageRule(prompt="Extract key points only")]
rules_list = [report_rules, data_rules, presentation_rules]

result = db.ingest_files(files=files, rules=rules_list, parallel=True)

from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
from pydantic import BaseModel

class ReportMetadata(BaseModel):
    # Schema handed to MetadataExtractionRule for the first file
    title: str
    author: str


class DataMetadata(BaseModel):
    # Schema handed to MetadataExtractionRule for the second file
    rows: int
    columns: int


files = ["report.pdf", "data.csv", "presentation.pptx"]

# One rule list per file, given in the same order as `files`
report_rules = [MetadataExtractionRule(schema=ReportMetadata)]
data_rules = [MetadataExtractionRule(schema=DataMetadata)]
presentation_rules = [NaturalLanguageRule(prompt="Extract key points only")]
rules_list = [report_rules, data_rules, presentation_rules]

result = db.ingest_files(files=files, rules=rules_list, parallel=True)

from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
from pydantic import BaseModel

class ReportMetadata(BaseModel):
    title: str
    author: str
    
class DataMetadata(BaseModel):
    rows: int
    columns: int
    
# Define different rules for each file
rules_list = [
    [MetadataExtractionRule(schema=ReportMetadata)],
    [MetadataExtractionRule(schema=DataMetadata)],
    [NaturalLanguageRule(prompt="Extract key points only")]
]

files = ["report.pdf", "data.csv", "presentation.pptx"]

result = await db.ingest_files(
    files=files,
    rules=rules_list,
    parallel=True
)

Using Different File Input Types

import io
from pathlib import Path

# Mixing different file input types
file1 = "document.pdf"                       # Path string
file2 = Path("image.png")                    # Path object
file4 = b"Hello, world!"                     # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data")   # BytesIO (requires filename)

# Open the file object in a `with` block so the handle is closed even if
# ingest_files raises; a manual close() after the call would be skipped
# on an exception.
with open("data.csv", "rb") as file3:        # File object
    result = db.ingest_files(
        files=[file1, file2, file3, file4, file5],
        # One metadata dict per file, in the same order as `files`
        metadata=[
            {"type": "document"},
            {"type": "image"},
            {"type": "data"},
            {"type": "text", "filename": "hello.txt"},
            {"type": "text", "filename": "memory-data.txt"},
        ],
    )

import io
from pathlib import Path

# Mixing different file input types
file1 = "document.pdf"                       # Path string
file2 = Path("image.png")                    # Path object
file4 = b"Hello, world!"                     # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data")   # BytesIO (requires filename)

# Open the file object in a `with` block so the handle is closed even if
# ingest_files raises; a manual close() after the call would be skipped
# on an exception.
with open("data.csv", "rb") as file3:        # File object
    result = db.ingest_files(
        files=[file1, file2, file3, file4, file5],
        # One metadata dict per file, in the same order as `files`
        metadata=[
            {"type": "document"},
            {"type": "image"},
            {"type": "data"},
            {"type": "text", "filename": "hello.txt"},
            {"type": "text", "filename": "memory-data.txt"},
        ],
    )

import io
from pathlib import Path

# Mixing different file input types
file1 = "document.pdf"                       # Path string
file2 = Path("image.png")                    # Path object
file3 = open("data.csv", "rb")               # File object
file4 = b"Hello, world!"                     # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data")   # BytesIO (requires filename)

result = await db.ingest_files(
    files=[file1, file2, file3, file4, file5],
    metadata=[
        {"type": "document"},
        {"type": "image"},
        {"type": "data"},
        {"type": "text", "filename": "hello.txt"},
        {"type": "text", "filename": "memory-data.txt"}
    ]
)

# Don't forget to close file objects
file3.close()

Client

Document Ingestion

Document Retrieval

Data Organization

Document Updates

Batch Operations

Knowledge Graph Operations

Cache Management

Usage

Parameters

Returns

Advanced Examples

Per-File Metadata

Per-File Rules

Using Different File Input Types

Client

Document Ingestion

Document Retrieval

Data Organization

Document Updates

Batch Operations

Knowledge Graph Operations

Cache Management

Usage

Parameters

Returns

Advanced Examples

Per-File Metadata

Per-File Rules

Using Different File Input Types

Usage

Parameters

Returns

Advanced Examples

Per-File Metadata

Per-File Rules

Using Different File Input Types