class mlrun.artifacts.document.DocumentArtifact(original_source: str | None = None, document_loader_spec: DocumentLoaderSpec | None = None, collections: dict | None = None, **kwargs)[source]#

Bases: Artifact

A specialized artifact class, inheriting from the generic Artifact, used to maintain document metadata.

class DocumentArtifactSpec(*args, document_loader: DocumentLoaderSpec | None = None, original_source: str | None = None, **kwargs)[source]#

Bases: ArtifactSpec

class DocumentArtifactStatus(*args, collections: dict | None = None, **kwargs)[source]#

Bases: ArtifactStatus

METADATA_ARTIFACT_KEY = 'mlrun_key'#
METADATA_ARTIFACT_PROJECT = 'mlrun_project'#
METADATA_ARTIFACT_TAG = 'mlrun_tag'#
METADATA_ARTIFACT_TARGET_PATH_KEY = 'mlrun_target_path'#
METADATA_CHUNK_KEY = 'mlrun_chunk'#
METADATA_ORIGINAL_SOURCE_KEY = 'original_source'#
METADATA_SOURCE_KEY = 'source'#
collection_add(collection_id: str) -> bool[source]#

Add a collection ID to the artifact's collection list.

Adds the specified collection ID to the artifact's local collection mapping if it doesn't already exist. This method only modifies the client-side artifact object and does not persist the changes to the MLRun DB. To save the changes permanently, you must call project.update_artifact() or context.update_artifact() after this method.

Parameters:

collection_id (str) -- The ID of the collection to add

collection_remove(collection_id: str) -> bool[source]#

Remove a collection ID from the artifact's collection list.

Removes the specified collection ID from the artifact's local collection mapping. This method only modifies the client-side artifact object and does not persist the changes to the MLRun DB. To save the changes permanently, you must call project.update_artifact() or context.update_artifact() after this method.

Parameters:

collection_id (str) -- The ID of the collection to remove

get_source()[source]#

Get the source URI for this artifact.

static key_from_source(src_path: str) -> str[source]#

Convert a source path into a valid artifact key by replacing invalid characters with underscores.

Parameters:

src_path (str) -- The source path to be converted into a valid artifact key

Returns:

A modified version of the source path where all invalid characters are replaced with underscores while preserving valid sequences in their original positions

Return type:

str

Examples

>>> DocumentArtifact.key_from_source("data/file-name(v1).txt")
"data_file-name_v1__txt"
kind = 'document'#
property spec: DocumentArtifactSpec#
property status: DocumentArtifactStatus#
to_langchain_documents(splitter: TextSplitter | None = None) -> list['Document'][source]#
class mlrun.artifacts.document.DocumentLoaderSpec(loader_class_name: str = 'langchain_community.document_loaders.TextLoader', src_name: str = 'file_path', download_object: bool = True, kwargs: dict | None = None)[source]#

Bases: ModelObj

A class to load a document from a file path using a specified loader class.

This class is responsible for loading documents from a given source path using a specified loader class. The loader class is dynamically imported and instantiated with the provided arguments. The loaded documents can be optionally uploaded as artifacts. Note that only loader classes that return single results (e.g., TextLoader, UnstructuredHTMLLoader, WebBaseLoader(scalar)) are supported - loaders returning multiple results like DirectoryLoader or WebBaseLoader(list) are not compatible.

loader_class_name#

The name of the loader class to use for loading documents.

Type:

str

src_name#

The name of the source attribute to pass to the loader class.

Type:

str

kwargs#

Additional keyword arguments to pass to the loader class.

Type:

Optional[dict]

Initialize the document loader specification.

Parameters:
  • loader_class_name (str) -- The name of the loader class to use.

  • src_name (str) -- The name of the source attribute to pass to the loader class (e.g. "file_path").

  • kwargs (Optional[dict]) -- Additional keyword arguments to pass to the loader class.

  • download_object (bool, optional) -- If True, the file will be downloaded before launching the loader. If False, the loader accepts a link that should not be downloaded. Defaults to True.

Example

>>> # Create a loader specification for PDF documents
>>> loader_spec = DocumentLoaderSpec(
...     loader_class_name="langchain_community.document_loaders.PyPDFLoader",
...     src_name="file_path",
...     kwargs={"extract_images": True},
... )
>>> # Create a loader instance for a specific PDF file
>>> pdf_loader = loader_spec.make_loader("/path/to/document.pdf")
>>> # Load the documents
>>> documents = pdf_loader.load()
make_loader(src_path)[source]#
class mlrun.artifacts.document.MLRunLoader(source_path: str, loader_spec: DocumentLoaderSpec, artifact_key='%%', producer: MlrunProject | str | MLClientCtx | None = None, upload: bool = False, tag: str = '', labels: dict[str, str] | None = None)[source]#

Bases: object

A factory class for creating instances of a dynamically defined document loader.

Parameters:
  • artifact_key (str, optional) -- The key for the artifact to be logged. The '%%' pattern in the key will be replaced by the source path with any unsupported characters converted to '_'. Defaults to "%%".

  • source_path (str) -- The source path of the document to be loaded.

  • loader_spec (DocumentLoaderSpec) -- Specification for the document loader.

  • producer (Optional[Union[MlrunProject, str, MLClientCtx]], optional) -- The producer of the document. If not specified, will try to get the current MLRun context or project. Defaults to None.

  • upload (bool, optional) -- Flag indicating whether to upload the document. Defaults to False.

  • labels (Optional[Dict[str, str]], optional) -- Key-value labels to attach to the artifact. Defaults to None.

  • tag (str, optional) -- Version tag for the artifact. Defaults to "".

Returns:

An instance of a dynamically defined subclass of BaseLoader.

Return type:

DynamicDocumentLoader

Example

>>> # Create a document loader specification
>>> loader_spec = DocumentLoaderSpec(
...     loader_class_name="langchain_community.document_loaders.TextLoader",
...     src_name="file_path",
... )
>>> # Create a basic loader for a single file
>>> loader = MLRunLoader(
...     source_path="/path/to/document.txt",
...     loader_spec=loader_spec,
...     artifact_key="my_doc",
...     producer=project,
...     upload=True,
... )
>>> documents = loader.load()
>>> # Create a loader with auto-generated keys
>>> loader = MLRunLoader(
...     source_path="/path/to/document.txt",
...     loader_spec=loader_spec,
...     artifact_key="%%",  # %% will be replaced with encoded path
...     producer=project,
... )
>>> documents = loader.load()
>>> # Use with DirectoryLoader
>>> from langchain_community.document_loaders import DirectoryLoader
>>> dir_loader = DirectoryLoader(
...     "/path/to/directory",
...     glob="**/*.txt",
...     loader_cls=MLRunLoader,
...     loader_kwargs={
...         "loader_spec": loader_spec,
...         "artifact_key": "%%",
...         "producer": project,
...         "upload": True,
...     },
... )
>>> documents = dir_loader.load()
static artifact_key_instance(artifact_key: str, src_path: str) -> str[source]#