Document Ingestion
Document Retrieval
Data Organization
Document Updates
Batch Operations
Knowledge Graph Operations
Cache Management
ingest_files
Batch ingest multiple files into Morphik
Usage
from morphik import Morphik
db = Morphik()
# Batch ingest files with shared metadata
result = db.ingest_files(
files=["document1.pdf", "document2.docx", "image.png"],
metadata={"category": "reports"},
use_colpali=True,
parallel=True
)
# Process the results
for doc in result["documents"]:
print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")
# Check for errors
for error in result["errors"]:
print(f"Error ingesting {error.get('filename')}: {error.get('error')}")
from morphik import Morphik
db = Morphik()
# Batch ingest files with shared metadata
result = db.ingest_files(
files=["document1.pdf", "document2.docx", "image.png"],
metadata={"category": "reports"},
use_colpali=True,
parallel=True
)
# Process the results
for doc in result["documents"]:
print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")
# Check for errors
for error in result["errors"]:
print(f"Error ingesting {error.get('filename')}: {error.get('error')}")
from morphik import AsyncMorphik
async with AsyncMorphik() as db:
# Batch ingest files with shared metadata
result = await db.ingest_files(
files=["document1.pdf", "document2.docx", "image.png"],
metadata={"category": "reports"},
use_colpali=True,
parallel=True
)
# Process the results
for doc in result["documents"]:
print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")
# Check for errors
for error in result["errors"]:
print(f"Error ingesting {error.get('filename')}: {error.get('error')}")
Parameters
files
(List[Union[str, bytes, BinaryIO, Path]]): List of files to ingest (path strings, bytes, file objects, or Path objects).
metadata
(Dict[str, Any] | List[Dict[str, Any]], optional): Metadata to apply to the files. Can be either:
- A single dict to apply to all files
- A list of dicts, one per file (must match the length of files)
rules
(List, optional): Rules to apply during ingestion. Can be either:
- A single list of rules to apply to all files
- A list of rule lists, one per file
use_colpali
(bool, optional): Whether to use the ColPali-style embedding model. Defaults to True.
parallel
(bool, optional): Whether to process files in parallel. Defaults to True.
Returns
An object containing:
documents
: List of successfully ingested Document objects
errors
: List of errors encountered during ingestion (each error is a dict with "filename" and "error" keys)
Advanced Examples
Per-File Metadata
# Ingest files with different metadata for each file
files = ["report.pdf", "data.csv", "presentation.pptx"]
# Metadata must match the length of files list
metadata_list = [
{"category": "reports", "author": "Alice"},
{"category": "data", "source": "database"},
{"category": "presentations", "department": "marketing"}
]
result = db.ingest_files(
files=files,
metadata=metadata_list
)
# Ingest files with different metadata for each file
files = ["report.pdf", "data.csv", "presentation.pptx"]
# Metadata must match the length of files list
metadata_list = [
{"category": "reports", "author": "Alice"},
{"category": "data", "source": "database"},
{"category": "presentations", "department": "marketing"}
]
result = db.ingest_files(
files=files,
metadata=metadata_list
)
# Ingest files with different metadata for each file
files = ["report.pdf", "data.csv", "presentation.pptx"]
# Metadata must match the length of files list
metadata_list = [
{"category": "reports", "author": "Alice"},
{"category": "data", "source": "database"},
{"category": "presentations", "department": "marketing"}
]
result = await db.ingest_files(
files=files,
metadata=metadata_list
)
Per-File Rules
from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
from pydantic import BaseModel
class ReportMetadata(BaseModel):
title: str
author: str
class DataMetadata(BaseModel):
rows: int
columns: int
# Define different rules for each file
rules_list = [
[MetadataExtractionRule(schema=ReportMetadata)],
[MetadataExtractionRule(schema=DataMetadata)],
[NaturalLanguageRule(prompt="Extract key points only")]
]
files = ["report.pdf", "data.csv", "presentation.pptx"]
result = db.ingest_files(
files=files,
rules=rules_list,
parallel=True
)
from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
from pydantic import BaseModel
class ReportMetadata(BaseModel):
title: str
author: str
class DataMetadata(BaseModel):
rows: int
columns: int
# Define different rules for each file
rules_list = [
[MetadataExtractionRule(schema=ReportMetadata)],
[MetadataExtractionRule(schema=DataMetadata)],
[NaturalLanguageRule(prompt="Extract key points only")]
]
files = ["report.pdf", "data.csv", "presentation.pptx"]
result = db.ingest_files(
files=files,
rules=rules_list,
parallel=True
)
from morphik.rules import MetadataExtractionRule, NaturalLanguageRule
from pydantic import BaseModel
class ReportMetadata(BaseModel):
title: str
author: str
class DataMetadata(BaseModel):
rows: int
columns: int
# Define different rules for each file
rules_list = [
[MetadataExtractionRule(schema=ReportMetadata)],
[MetadataExtractionRule(schema=DataMetadata)],
[NaturalLanguageRule(prompt="Extract key points only")]
]
files = ["report.pdf", "data.csv", "presentation.pptx"]
result = await db.ingest_files(
files=files,
rules=rules_list,
parallel=True
)
Using Different File Input Types
import io
from pathlib import Path
# Mixing different file input types
file1 = "document.pdf" # Path string
file2 = Path("image.png") # Path object
file3 = open("data.csv", "rb") # File object
file4 = b"Hello, world!" # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data") # BytesIO (requires filename)
result = db.ingest_files(
files=[file1, file2, file3, file4, file5],
metadata=[
{"type": "document"},
{"type": "image"},
{"type": "data"},
{"type": "text", "filename": "hello.txt"},
{"type": "text", "filename": "memory-data.txt"}
]
)
# Don't forget to close file objects
file3.close()
import io
from pathlib import Path
# Mixing different file input types
file1 = "document.pdf" # Path string
file2 = Path("image.png") # Path object
file3 = open("data.csv", "rb") # File object
file4 = b"Hello, world!" # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data") # BytesIO (requires filename)
result = db.ingest_files(
files=[file1, file2, file3, file4, file5],
metadata=[
{"type": "document"},
{"type": "image"},
{"type": "data"},
{"type": "text", "filename": "hello.txt"},
{"type": "text", "filename": "memory-data.txt"}
]
)
# Don't forget to close file objects
file3.close()
import io
from pathlib import Path
# Mixing different file input types
file1 = "document.pdf" # Path string
file2 = Path("image.png") # Path object
file3 = open("data.csv", "rb") # File object
file4 = b"Hello, world!" # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data") # BytesIO (requires filename)
result = await db.ingest_files(
files=[file1, file2, file3, file4, file5],
metadata=[
{"type": "document"},
{"type": "image"},
{"type": "data"},
{"type": "text", "filename": "hello.txt"},
{"type": "text", "filename": "memory-data.txt"}
]
)
# Don't forget to close file objects
file3.close()
Was this page helpful?