cleverdoc package

Contents

cleverdoc package#

Subpackages#

Submodules#

cleverdoc.enums module#

class cleverdoc.enums.Device(value)#

Bases: IntEnum

Execution Device

CPU = -1#
CUDA = 0#
class cleverdoc.enums.DicomCompression(value)#

Bases: str, Enum

An enumeration.

JPEGBaseline8Bit = 'JPEGBaseline8Bit'#
JPEGLSLossless = 'JPEGLSLossless'#
RLELossless = 'RLELossless'#
class cleverdoc.enums.ImageType(value)#

Bases: Enum

An enumeration.

FILE = 'file'#
OPENCV = 'opencv'#
PIL = 'pil'#
WEBP = 'webp'#

cleverdoc.license module#

class cleverdoc.license.LicenseValidator(*args, **kwargs)#

Bases: Singleton

License validator class (private API).

ENV_VAR_NAME = 'CLEVERDOC_LICENSE'#
PUBLIC_KEY = '-----BEGIN PUBLIC KEY----- MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA1OF1bu10OgWxvjY8mjWI 6VV4jZMnvV7D7xfGnjiU1z/3E0wzD92yGcDuB0IFmLYgKJw4OcYXJSOWBXoWH+x+ J5i7Pqf04LGzxH7KN3G3Pg1yTVQ4RJxRn0GnMivYn0RC6rgByHT02wuS0S2GLCk8 eMGndKZt6gnJSRMvc8fjNktDafKJfPfDb5aUyRAXq7xiNMsgVDuGRk0nGtA8qqDv TsZauAF1pY88qgUxn7XANWrB7qZqkKvhDrAMnSOXsTTd9roLBLAhJ5V3TNePW2vh Fc1uvRi28WOVhbZSwlGUDN8cBGi4ElTyf7gD6iqplNMW5Q9Wc+I8gVtrv+vqkAgd xwIDAQAB -----END PUBLIC KEY-----'#
SUPPORT_EMAIL = 'license@apicom.pro'#
check_license()#
class cleverdoc.license.Singleton(*args, **kwargs)#

Bases: object

cleverdoc.params module#

class cleverdoc.params.HasColor#

Bases: Params

color = Param(parent='undefined', name='color', doc='Color.')#
getColor() str#

Gets the value of color or its default value.

setColor(value)#

Sets the value of color.

class cleverdoc.params.HasImageType#

Bases: Params

getImageType()#

Gets the value of imageType.

imageType = Param(parent='undefined', name='imageType', doc='Image type.')#
setImageType(value)#

Sets the value of imageType.

class cleverdoc.params.HasInputCol#

Bases: Params

Mixin for param inputCol: input column name.

getInputCol() str#

Gets the value of inputCol or its default value.

inputCol: Param[str] = Param(parent='undefined', name='inputCol', doc='input column name.')#
setInputCol(value)#

Sets the value of inputCol.

class cleverdoc.params.HasInputCols#

Bases: Params

Mixin for param inputCols: input column names.

getInputCols()#

Gets the value of inputCols or its default value.

inputCols: Param[List[str]] = Param(parent='undefined', name='inputCols', doc='input column names.')#
setInputCols(value)#

Sets the value of inputCols.

class cleverdoc.params.HasKeepInput#

Bases: Params

getKeepInput()#

Gets the value of keepInput.

keepInput = Param(parent='undefined', name='keepInput', doc='Keep input column in output.')#
setKeepInput(value)#

Sets the value of keepInput.

class cleverdoc.params.HasNumPartitions#

Bases: object

getNumPartitions()#

Gets the value of numPartitions.

numPartitions = Param(parent='undefined', name='numPartitions', doc='Number of partitions.')#
setNumPartitions(value)#

Sets the value of numPartitions.

class cleverdoc.params.HasOrigin#

Bases: Params

getOriginCol() str#

Gets the value of originCol or its default value.

originCol = Param(parent='undefined', name='originCol', doc='Input column name with original path of file.')#
setOriginCol(value)#

Sets the value of originCol.

class cleverdoc.params.HasOutputCol#

Bases: Params

Mixin for param outputCol: output column name.

getOutputCol() str#

Gets the value of outputCol or its default value.

outputCol: Param[str] = Param(parent='undefined', name='outputCol', doc='output column name.')#
setOutputCol(value)#

Sets the value of outputCol.

class cleverdoc.params.HasResolution#

Bases: Params

POINTS_PER_INCH = 72#
getResolution()#

Gets the value of resolution.

resolution = Param(parent='undefined', name='resolution', doc='Resolution of image.')#
setResolution(value)#

Sets the value of resolution.

class cleverdoc.params.HasWhiteList#

Bases: Params

Mixin for param whiteList.

getWhiteList()#

Gets the value of whiteList or its default value.

setWhiteList(value)#

Sets the value of whiteList.

whiteList: Param[List[str]] = Param(parent='undefined', name='whiteList', doc='White list.')#

Module contents#

class cleverdoc.BinaryToImage(*args, **kwargs)#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasImageType, HasOrigin, DefaultParamsReadable, DefaultParamsWritable, LicenseValidator

Transform Binary Content to Image

class cleverdoc.Device(value)#

Bases: IntEnum

Execution Device

CPU = -1#
CUDA = 0#
class cleverdoc.DicomCompression(value)#

Bases: str, Enum

An enumeration.

JPEGBaseline8Bit = 'JPEGBaseline8Bit'#
JPEGLSLossless = 'JPEGLSLossless'#
RLELossless = 'RLELossless'#
class cleverdoc.DicomDrawBoxes#

Bases: Transformer, HasInputCols, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable, HasKeepInput

Draw regions to dicom.

aggCols = Param(parent='undefined', name='aggCols', doc='agg column names.')#
blackList = Param(parent='undefined', name='blackList', doc='List of tags for clear.')#
static clean_tag_name(tag)#
compression = Param(parent='undefined', name='compression', doc='Compression type.')#
forceCompress = Param(parent='undefined', name='forceCompress', doc='True - Force compress image. False - compress only in case original image was compressed.')#
get_tags()#
removePrivateTags = Param(parent='undefined', name='removePrivateTags', doc='Remove private tags.')#
scaleFactor = Param(parent='undefined', name='scaleFactor', doc='Scale factor.')#
setAggCols(value)#

Sets the value of aggCols.

setCompression(value)#

Sets the value of compression.

setForceCompress(value)#

Sets the value of forceCompress.

setScaleFactor(value)#

Sets the value of scaleFactor.

transform_dicom(dicom, regions)#
class cleverdoc.DicomToImage(*args, **kwargs)#

Bases: Transformer, HasInputCols, HasOutputCol, HasKeepInput, HasOrigin, JavaMLReadable, JavaMLWritable, BaseDicom, LicenseValidator

Transform Dicom Binary Content to the Image

add_image(arr, ds, f, origin)#
frameLimit = Param(parent='undefined', name='frameLimit', doc='Limit number of frames for extraction. For extract all frames need to set this param to 0.')#
get_input_col(dataset, col_type, required=False, position=0)#
metadataCol = Param(parent='undefined', name='metadataCol', doc='Output column name for dicom metatdata.')#
pageNumCol = Param(parent='undefined', name='pageNumCol', doc='Page number output column name.')#
scale = Param(parent='undefined', name='scale', doc='Width of the desired input image. Image will be resized to this width.')#
setFrameLimit(value)#

Sets the value of frameLimit.

setInputCols(value)#

Sets the value of inputCols.

setMetadataCol(value)#

Sets the value of metadataCol.

setOutputCol(value)#

Sets the value of outputCol.

setPageNumCol(value)#

Sets the value of pageNumCol.

setScale(value)#

Sets the value of scale.

transform_dicom(origin, parts, dicom, params)#

Extract frames (images) from the DICOM file.

@param origin: Path to the file (generated by the binaryFile Spark data source).
@param parts: List of the frames to extract.
@param dicom: Binary data or path to the local file.
@return: Rows with the image, exception and frame number for each item.

class cleverdoc.Enum(value)#

Bases: object

Generic enumeration.

Derive from this class to define new enumerations.

class cleverdoc.ImageDrawBoxes#

Bases: Transformer, HasInputCols, HasOutputCol, HasKeepInput, HasImageType, DefaultParamsReadable, DefaultParamsWritable, HasColor, HasNumPartitions

Draw boxes on image

filled = Param(parent='undefined', name='filled', doc='Fill rectangle.')#
setFilled(value)#

Sets the value of filled.

class cleverdoc.ImageToPdf#

Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasOutputCol, HasInputCol, HasResolution

Transform Image to PDF

POINTS_PER_INCH = 72#
aggregatePages = Param(parent='undefined', name='aggregatePages', doc='Aggregate pages in one PDF document.')#
convert_to_pdf(images)#
getAggregatePages()#

Gets the value of aggregatePages or its default value.

setAggregatePages(value)#

Sets the value of aggregatePages.

class cleverdoc.ImageToString#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, DefaultParamsReadable, DefaultParamsWritable

Extract text from image

config = Param(parent='undefined', name='config', doc='Config.')#
getConfig()#

Gets the value of config.

getScaleFactor()#

Gets the value of scaleFactor.

getScoreThreshold()#

Gets the value of scoreThreshold.

scaleFactor = Param(parent='undefined', name='scaleFactor', doc='Scale Factor.')#
scoreThreshold = Param(parent='undefined', name='scoreThreshold', doc='Scale Factor.')#
setConfig(value)#

Sets the value of config.

setScaleFactor(value)#

Sets the value of scaleFactor.

setScoreThreshold(value)#

Sets the value of scoreThreshold.

transform_local(image)#
transform_udf(image)#
class cleverdoc.ImageToStringOnnx#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, DefaultParamsReadable, DefaultParamsWritable

Extract text from image

config = Param(parent='undefined', name='config', doc='Config.')#
getConfig()#

Gets the value of config.

getScaleFactor()#

Gets the value of scaleFactor.

getScoreThreshold()#

Gets the value of scoreThreshold.

scaleFactor = Param(parent='undefined', name='scaleFactor', doc='Scale Factor.')#
scoreThreshold = Param(parent='undefined', name='scoreThreshold', doc='Scale Factor.')#
setConfig(value)#

Sets the value of config.

setScaleFactor(value)#

Sets the value of scaleFactor.

setScoreThreshold(value)#

Sets the value of scoreThreshold.

transform_local(image)#
transform_udf(image)#
class cleverdoc.ImageType(value)#

Bases: Enum

An enumeration.

FILE = 'file'#
OPENCV = 'opencv'#
PIL = 'pil'#
WEBP = 'webp'#
class cleverdoc.IntEnum(value)#

Bases: int, Enum

Enum where members are also (and must be) ints

class cleverdoc.Ner(*args, **kwargs)#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable, HasNumPartitions, LicenseValidator

static aggregate_ner_results(text, pipeline, max_length=500, stride=256)#
batchSize = Param(parent='undefined', name='batchSize', doc='batchSize.')#
device = Param(parent='undefined', name='device', doc='Device.')#
getDevice()#

Gets the value of device.

getModel()#

Gets the value of model.

getThreshold()#

Gets the value of threshold or its default value.

get_pipeline()#
model = Param(parent='undefined', name='model', doc='Model name.')#
setBatchSize(value)#

Sets the value of batchSize.

setDevice(value)#

Sets the value of device.

setModel(value)#

Sets the value of model.

setThreshold(value)#

Sets the value of threshold.

static split_text(text, max_length=500, stride=256)#
threshold = Param(parent='undefined', name='threshold', doc='Device.')#
transform_local(text)#
transform_udf(image)#
static transform_udf_pandas(texts: DataFrame, params: Series) DataFrame#
class cleverdoc.NerLLM#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable

getModel()#

Gets the value of model.

model = Param(parent='undefined', name='model', doc='Model name.')#
setModel(value)#

Sets the value of model.

transform_local(text)#
transform_udf(image)#
class cleverdoc.NerMerger#

Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasInputCols, HasOutputCol

class cleverdoc.NerRuleBased(*args, **kwargs)#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable, HasNumPartitions, LicenseValidator

getDevice()#

Gets the value of device.

getModel()#

Gets the value of model.

getThreshold()#

Gets the value of threshold or its default value.

setBatchSize(value)#

Sets the value of batchSize.

setDevice(value)#

Sets the value of device.

setModel(value)#

Sets the value of model.

setThreshold(value)#

Sets the value of threshold.

threshold = Param(parent='undefined', name='threshold', doc='Device.')#
static transform_udf_pandas(texts: DataFrame, params: Series) DataFrame#
class cleverdoc.PdfAssembler#

Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasOutputCol, HasInputCol

Transform Image to PDF

convert_to_pdf(pdfs)#
class cleverdoc.PdfToImage(*args, **kwargs)#

Bases: BinaryToImage, HasResolution

getPageNumber()#
pageNumber = Param(parent='undefined', name='pageNumber', doc='Page number to convert to image')#
setPageNumber(value)#
class cleverdoc.SingleImageToPdf#

Bases: Transformer, DefaultParamsReadable, DefaultParamsWritable, HasOutputCol, HasInputCol, HasResolution

Transform Image to PDF

POINTS_PER_INCH = 72#
convert_to_pdf(image)#
class cleverdoc.StringToKeyValue#

Bases: Transformer, HasInputCol, HasOutputCol, HasKeepInput, HasWhiteList, DefaultParamsReadable, DefaultParamsWritable

getModel()#

Gets the value of model.

model = Param(parent='undefined', name='model', doc='Model name.')#
setModel(value)#

Sets the value of model.

transform_local(text)#
transform_udf(image)#
cleverdoc.get_aws_version()#
cleverdoc.get_name(path, keep_subfolder_level=0)#
cleverdoc.get_name_udf(path, keep_subfolder_level=0)#
cleverdoc.show_dicom(df, fields='content', limit=5, width=700, show_meta=True, only_meta=False, desc_width=60, limit_frame=5)#
cleverdoc.show_image(image, width=600, show_meta=True, id=0)#
cleverdoc.show_images(df, field='image', limit=5, width=600, show_meta=True)#
class cleverdoc.show_pdf_file(pdf, size=(600, 500))#

Bases: object

cleverdoc.start(license, extra_conf=None, master_url='local[*]', with_aws=False, logLevel='ERROR')#

Start a Spark session with the CleverDoc configuration.

@param extra_conf: Instance of SparkConf or dict with extra configuration.