cleverdoc package

cleverdoc package#

Subpackages#

Submodules#

cleverdoc.enums module#

class cleverdoc.enums.Device(value)#

Bases: IntEnum

Execution Device

CPU = -1#

CUDA = 0#

class cleverdoc.enums.DicomCompression(value)#

Bases: str, Enum

An enumeration.

JPEGBaseline8Bit = 'JPEGBaseline8Bit'#

JPEGLSLossless = 'JPEGLSLossless'#

RLELossless = 'RLELossless'#

class cleverdoc.enums.ImageType(value)#

Bases: Enum

An enumeration.

FILE = 'file'#

OPENCV = 'opencv'#

PIL = 'pil'#

WEBP = 'webp'#

cleverdoc.license module#

class cleverdoc.license.LicenseValidator(*args, **kwargs)#

Bases: Singleton

License validator class.

ENV_VAR_NAME = 'CLEVERDOC_LICENSE'#

PUBLIC_KEY = '-----BEGIN PUBLIC KEY----- MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA1OF1bu10OgWxvjY8mjWI 6VV4jZMnvV7D7xfGnjiU1z/3E0wzD92yGcDuB0IFmLYgKJw4OcYXJSOWBXoWH+x+ J5i7Pqf04LGzxH7KN3G3Pg1yTVQ4RJxRn0GnMivYn0RC6rgByHT02wuS0S2GLCk8 eMGndKZt6gnJSRMvc8fjNktDafKJfPfDb5aUyRAXq7xiNMsgVDuGRk0nGtA8qqDv TsZauAF1pY88qgUxn7XANWrB7qZqkKvhDrAMnSOXsTTd9roLBLAhJ5V3TNePW2vh Fc1uvRi28WOVhbZSwlGUDN8cBGi4ElTyf7gD6iqplNMW5Q9Wc+I8gVtrv+vqkAgd xwIDAQAB -----END PUBLIC KEY-----'#

SUPPORT_EMAIL = 'license@apicom.pro'#

check_license()#

class cleverdoc.license.Singleton(*args, **kwargs)#: Bases: object

cleverdoc.params module#

class cleverdoc.params.HasColor#

Bases: Params

color = Param(parent='undefined', name='color', doc='Color.')#

getColor() → str#: Gets the value of color or its default value.

setColor(value)#: Sets the value of color.

class cleverdoc.params.HasImageType#

Bases: Params

getImageType()#: Gets the value of imageType.

imageType = Param(parent='undefined', name='imageType', doc='Image type.')#

setImageType(value)#: Sets the value of imageType.

class cleverdoc.params.HasInputCol#

Bases: Params

Mixin for param inputCol: input column name.

getInputCol() → str#: Gets the value of inputCol or its default value.

inputCol: Param[str] = Param(parent='undefined', name='inputCol', doc='input column name.')#

setInputCol(value)#: Sets the value of inputCol.

class cleverdoc.params.HasInputCols#

Bases: Params

Mixin for param inputCols: input column names.

getInputCols()#: Gets the value of inputCols or its default value.

inputCols: Param[List[str]] = Param(parent='undefined', name='inputCols', doc='input column names.')#

setInputCols(value)#: Sets the value of inputCols.

class cleverdoc.params.HasKeepInput#

Bases: Params

getKeepInput()#: Gets the value of keepInput.

keepInput = Param(parent='undefined', name='keepInput', doc='Keep input column in output.')#

setKeepInput(value)#: Sets the value of keepInput.

class cleverdoc.params.HasNumPartitions#

Bases: object

getNumPartitions()#: Gets the value of numPartitions.

numPartitions = Param(parent='undefined', name='numPartitions', doc='Number of partitions.')#

setNumPartitions(value)#: Sets the value of numPartitions.

class cleverdoc.params.HasOrigin#

Bases: Params

getOriginCol() → str#: Gets the value of originCol or its default value.

originCol = Param(parent='undefined', name='originCol', doc='Input column name with original path of file.')#

setOriginCol(value)#: Sets the value of originCol.

class cleverdoc.params.HasOutputCol#

Bases: Params

Mixin for param outputCol: output column name.

getOutputCol() → str#: Gets the value of outputCol or its default value.

outputCol: Param[str] = Param(parent='undefined', name='outputCol', doc='output column name.')#

setOutputCol(value)#: Sets the value of outputCol.

class cleverdoc.params.HasResolution#

Bases: Params

POINTS_PER_INCH = 72#

getResolution()#: Gets the value of resolution.

resolution = Param(parent='undefined', name='resolution', doc='Resolution of image.')#

setResolution(value)#: Sets the value of resolution.

class cleverdoc.params.HasWhiteList#

Bases: Params

Mixin for param whiteList.

getWhiteList()#: Gets the value of whiteList or its default value.

setWhiteList(value)#: Sets the value of whiteList.

whiteList: Param[List[str]] = Param(parent='undefined', name='whiteList', doc='White list.')#

Module contents#

class cleverdoc.BinaryToImage(*args, **kwargs)#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasImageType, HasOrigin, DefaultParamsReadable, DefaultParamsWritable, LicenseValidator

Transform Binary Content to Image

class cleverdoc.Device(value)#

Bases: IntEnum

Execution Device

CPU = -1#

CUDA = 0#

class cleverdoc.DicomCompression(value)#

Bases: str, Enum

An enumeration.

JPEGBaseline8Bit = 'JPEGBaseline8Bit'#

JPEGLSLossless = 'JPEGLSLossless'#

RLELossless = 'RLELossless'#

class cleverdoc.DicomDrawBoxes#

Bases: Transformer, HasInputCols, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable, HasKeepInput

Draw regions to dicom.

aggCols = Param(parent='undefined', name='aggCols', doc='agg column names.')#

blackList = Param(parent='undefined', name='blackList', doc='List of tags for clear.')#

static clean_tag_name(tag)#

compression = Param(parent='undefined', name='compression', doc='Compression type.')#

forceCompress = Param(parent='undefined', name='forceCompress', doc='True - Force compress image. False - compress only in case original image was compressed.')#

get_tags()#

removePrivateTags = Param(parent='undefined', name='removePrivateTags', doc='Remove private tags.')#

scaleFactor = Param(parent='undefined', name='scaleFactor', doc='Scale factor.')#

setAggCols(value)#: Sets the value of aggCols.

setCompression(value)#: Sets the value of compression.

setForceCompress(value)#: Sets the value of forceCompress.

setScaleFactor(value)#: Sets the value of scaleFactor.

transform_dicom(dicom, regions)#

class cleverdoc.DicomToImage(*args, **kwargs)#

Bases: Transformer, HasInputCols, HasOutputCol, HasKeepInput, HasOrigin, JavaMLReadable, JavaMLWritable, BaseDicom, LicenseValidator

Transform Dicom Binary Content to the Image

add_image(arr, ds, f, origin)#

frameLimit = Param(parent='undefined', name='frameLimit', doc='Limit number of frames for extraction. To extract all frames, set this param to 0.')#

get_input_col(dataset, col_type, required=False, position=0)#

metadataCol = Param(parent='undefined', name='metadataCol', doc='Output column name for dicom metadata.')#

pageNumCol = Param(parent='undefined', name='pageNumCol', doc='Page number output column name.')#

scale = Param(parent='undefined', name='scale', doc='Width of the desired input image. Image will be resized to this width.')#

setFrameLimit(value)#: Sets the value of frameLimit.

setInputCols(value)#: Sets the value of inputCols.

setMetadataCol(value)#: Sets the value of metadataCol.

setOutputCol(value)#: Sets the value of outputCol.

setPageNumCol(value)#: Sets the value of pageNumCol.

setScale(value)#: Sets the value of scale.

transform_dicom(origin, parts, dicom, params)#

Extract frames(images) from the dicom file

@param origin: Path to the file (generated by the binaryFile spark datasource) @param parts: List of the frames to extract @param dicom: Binary data or path to the local file @return: Rows with image, exception and frame number for each item

class cleverdoc.Enum(value)#

Bases: object

Generic enumeration.

Derive from this class to define new enumerations.

class cleverdoc.ImageDrawBoxes#

Bases: Transformer, HasInputCols, HasOutputCol, HasKeepInput, HasImageType, DefaultParamsReadable, DefaultParamsWritable, HasColor, HasNumPartitions

Draw boxes on image

filled = Param(parent='undefined', name='filled', doc='Fill rectangle.')#

setFilled(value)#: Sets the value of filled.

class cleverdoc.ImageToPdf#

Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasOutputCol, HasInputCol, HasResolution

Transform Image to PDF

POINTS_PER_INCH = 72#

aggregatePages = Param(parent='undefined', name='aggregatePages', doc='Aggregate pages in one PDF document.')#

convert_to_pdf(images)#

getAggregatePages()#: Gets the value of aggregatePages or its default value.

setAggregatePages(value)#: Sets the value of aggregatePages.

class cleverdoc.ImageToString#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, DefaultParamsReadable, DefaultParamsWritable

Extract text from image

config = Param(parent='undefined', name='config', doc='Config.')#

getConfig()#: Gets the value of config.

getScaleFactor()#: Gets the value of scaleFactor.

getScoreThreshold()#: Gets the value of scoreThreshold.

scaleFactor = Param(parent='undefined', name='scaleFactor', doc='Scale Factor.')#

scoreThreshold = Param(parent='undefined', name='scoreThreshold', doc='Score threshold.')#

setConfig(value)#: Sets the value of config.

setScaleFactor(value)#: Sets the value of scaleFactor.

setScoreThreshold(value)#: Sets the value of scoreThreshold.

transform_local(image)#

transform_udf(image)#

class cleverdoc.ImageToStringOnnx#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, DefaultParamsReadable, DefaultParamsWritable

Extract text from image

config = Param(parent='undefined', name='config', doc='Config.')#

getConfig()#: Gets the value of config.

getScaleFactor()#: Gets the value of scaleFactor.

getScoreThreshold()#: Gets the value of scoreThreshold.

scaleFactor = Param(parent='undefined', name='scaleFactor', doc='Scale Factor.')#

scoreThreshold = Param(parent='undefined', name='scoreThreshold', doc='Score threshold.')#

setConfig(value)#: Sets the value of config.

setScaleFactor(value)#: Sets the value of scaleFactor.

setScoreThreshold(value)#: Sets the value of scoreThreshold.

transform_local(image)#

transform_udf(image)#

class cleverdoc.ImageType(value)#

Bases: Enum

An enumeration.

FILE = 'file'#

OPENCV = 'opencv'#

PIL = 'pil'#

WEBP = 'webp'#

class cleverdoc.IntEnum(value)#

Bases: int, Enum

Enum where members are also (and must be) ints

class cleverdoc.Ner(*args, **kwargs)#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable, HasNumPartitions, LicenseValidator

static aggregate_ner_results(text, pipeline, max_length=500, stride=256)#

batchSize = Param(parent='undefined', name='batchSize', doc='batchSize.')#

device = Param(parent='undefined', name='device', doc='Device.')#

getDevice()#: Gets the value of device.

getModel()#: Gets the value of model.

getThreshold()#: Gets the value of threshold or its default value.

get_pipeline()#

model = Param(parent='undefined', name='model', doc='Model name.')#

setBatchSize(value)#: Sets the value of batchSize.

setDevice(value)#: Sets the value of device.

setModel(value)#: Sets the value of model.

setThreshold(value)#: Sets the value of threshold.

static split_text(text, max_length=500, stride=256)#

threshold = Param(parent='undefined', name='threshold', doc='Threshold.')#

transform_local(text)#

transform_udf(image)#

static transform_udf_pandas(texts: DataFrame, params: Series) → DataFrame#

class cleverdoc.NerLLM#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable

getModel()#: Gets the value of model.

model = Param(parent='undefined', name='model', doc='Model name.')#

setModel(value)#: Sets the value of model.

transform_local(text)#

transform_udf(image)#

class cleverdoc.NerMerger#: Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasInputCols, HasOutputCol

class cleverdoc.NerRuleBased(*args, **kwargs)#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable, HasNumPartitions, LicenseValidator

getDevice()#: Gets the value of device.

getModel()#: Gets the value of model.

getThreshold()#: Gets the value of threshold or its default value.

setBatchSize(value)#: Sets the value of batchSize.

setDevice(value)#: Sets the value of device.

setModel(value)#: Sets the value of model.

setThreshold(value)#: Sets the value of threshold.

threshold = Param(parent='undefined', name='threshold', doc='Threshold.')#

static transform_udf_pandas(texts: DataFrame, params: Series) → DataFrame#

class cleverdoc.PdfAssembler#

Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasOutputCol, HasInputCol

Transform Image to PDF

convert_to_pdf(pdfs)#

class cleverdoc.PdfToImage(*args, **kwargs)#

Bases: BinaryToImage, HasResolution

getPageNumber()#

pageNumber = Param(parent='undefined', name='pageNumber', doc='Page number to convert to image')#

setPageNumber(value)#

class cleverdoc.SingleImageToPdf#

Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasOutputCol, HasInputCol, HasResolution

Transform Image to PDF

POINTS_PER_INCH = 72#

convert_to_pdf(image)#

class cleverdoc.StringToKeyValue#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable

getModel()#: Gets the value of model.

model = Param(parent='undefined', name='model', doc='Model name.')#

setModel(value)#: Sets the value of model.

transform_local(text)#

transform_udf(image)#

cleverdoc.get_aws_version()#

cleverdoc.get_name(path, keep_subfolder_level=0)#

cleverdoc.get_name_udf(path, keep_subfolder_level=0)#

cleverdoc.show_dicom(df, fields='content', limit=5, width=700, show_meta=True, only_meta=False, desc_width=60, limit_frame=5)#

cleverdoc.show_image(image, width=600, show_meta=True, id=0)#

cleverdoc.show_images(df, field='image', limit=5, width=600, show_meta=True)#

class cleverdoc.show_pdf_file(pdf, size=(600, 500))#: Bases: object

cleverdoc.start(license, extra_conf=None, master_url='local[*]', with_aws=False, logLevel='ERROR')#: Start Spark session with CleverDoc configuration @param extra_conf: Instance of SparkConf or dict with extra configuration.

cleverdoc package

Contents

cleverdoc package#

Subpackages#

Submodules#

cleverdoc.enums module#

cleverdoc.license module#

cleverdoc.params module#

Module contents#