Document Type JSON Structure
Hotkeys
Input JSON for processed PDF files
Output JSON for processed PDF files

Information Extraction (IE) is a process of extracting structured information (or key facts) from unstructured and/or semi-structured documents (invoices, claims, dividend news, etc.).

Information Extraction Human Task can be utilized for:

Saving and processing data extracted by a human in Automation Process
Collecting Training Set for Machine Learning model training
Verification of data extracted by Machine Learning model

Document Type JSON Structure

Below you can find an example of the Settings JSON for the Information Extraction Task:

Information Extraction Document Type JSON example

{
	"appLanguage": "en",
	"taskInstructionText": "Read the document carefully and tag the fields required for extraction.",
	"taskInstructionLink": "",
	"taskTypeLabel": "Information Extraction",
	"allowCustomValue": true,
	"autoSave": false,
	"regexFunctions": {
		"numberMore10": "(value) => { return Number(value.replace(/,/g, '.')) > 10}",
		"lengthMore3": "(value) => { return value.length > 3}",
		"startSymbol": "(value) => { return new RegExp(/^\\$.*/).test(value); }"
	},
	"colors": [
		"f79494",
		"ffaaff"
	],
	"metadata": [
		{
			"name": "input_custom_error_1",
			"type": "input",
			"label": "Input with custom error",
			"validationRegExp": "^[a-zA-Z]+$",
			"errorMessage": "Example of custom error"
		},
		{
			"name": "datepicker_5",
			"label": "format D MMM YYYY",
			"type": "date",
			"format": "D MMM YYYY"
		},
		{
			"name": "datepicker_4",
			"label": "format MMMM DD",
			"type": "date",
			"format": "MMMM DD"
		},
		{
			"name": "datepicker_3",
			"label": "keyboard format YYYY",
			"type": "date",
			"keyboard": true,
			"format": "YYYY"
		},
		{
			"name": "datepicker_2",
			"label": "keyboard format MM-DD-YYYY",
			"type": "date",
			"keyboard": true,
			"format": "MM-DD-YYYY"
		},
		{
			"name": "datepicker_1",
			"label": "keyboard format MM.DD.YYYY",
			"type": "date",
			"keyboard": true,
			"format": "MM.DD.YYYY"
		},
		{
			"name": "select_multiple_1",
			"type": "select",
			"required": true,
			"multiple": true,
			"label": "Multiple Select",
			"items": [
				"string_1",
				"string_2",
				"string_3"
			]
		},
		{
			"name": "select_2",
			"type": "select",
			"required": true,
			"label": "Single Select",
			"items": [
				{
					"value": "real_value_1",
					"label": "label_1"
				},
				{
					"value": "real_value_2",
					"label": "label_2"
				},
				{
					"value": "real_value_3"
				}
			]
		},
		{
			"name": "autocomplete_multiple_1",
			"type": "select",
			"required": true,
			"multiple": true,
			"autocomplete": true,
			"label": "Multiple Autocomplete",
			"description": "Description for Autocomplete",
			"items": [
				"string_1",
				"string_2",
				"string_3"
			]
		},
		{
			"name": "autocomplete_2",
			"type": "select",
			"required": true,
			"autocomplete": true,
			"label": "Single Autocomplete",
			"items": [
				{
					"value": "real_value_1",
					"label": "label_1"
				},
				{
					"value": "real_value_2",
					"label": "label_2"
				},
				{
					"value": "real_value_3"
				}
			]
		},
		{
			"name": "checkbox_1",
			"type": "checkbox",
			"markLabel": "Checkbox Label",
			"label": "Single Checkbox"
		},
		{
			"name": "input_with_mask_1",
			"type": "input",
			"label": "Input with mask",
			"mask": "+7 (999) 999-99-99"
		},
		{
			"name": "input_with_mask",
			"type": "input",
			"label": "Example of a mask characters escape",
			"mask": "+4\\9 99 999 99"
		},
		{
			"name": "number_input_1",
			"type": "number_input",
			"label": "Number Input Example"
		},
		{
			"name": "textarea_1",
			"type": "textarea",
			"label": "Text Area minRows: 5 maxRows:8",
			"minRows": 5,
			"maxRows": 8
		},
		{
			"name": "checkbox_group_1",
			"type": "checkbox_group",
			"label": "Checkbox Group String Values",
			"description": "Description for checkbox group",
			"items": [
				"string_1",
				"string_2",
				"string_3"
			]
		},
		{
			"name": "radio_group_1",
			"type": "radio_group",
			"required": true,
			"label": "Radio Group",
			"description": "Description for radio group",
			"items": [
				{
					"value": "real_value_1",
					"label": "label_1"
				},
				{
					"value": "real_value_2",
					"label": "label_2",
					"disabled": true
				},
				{
					"value": "real_value_3"
				}
			]
		},
		{
			"type": "info",
			"label": "Text info with paragraphs",
			"text": [
				"Donec neque elit, mattis nec urna ac, ullamcorper sagittis nibh. Donec nec maximus ipsum. Quisque consequat mauris arcu, et porta ligula sollicitudin a. Cras vitae sodales lectus. Nunc turpis nisl, cursus cursus lacus non, sollicitudin iaculis eros. Donec nibh erat, scelerisque non dui eu, aliquet pulvinar felis..",
				"Etiam ultrices turpis nulla, eu vestibulum quam ultricies quis. Nullam vel lacus scelerisque, semper ante id, dapibus augue. Pellentesque id nisl id tellus rutrum consequat a pretium eros. Integer lacinia odio non est aliquet consectetur. Cras id lacus sit amet orci suscipit fermentum ac nec lorem. Donec et mauris ut enim semper porta."
			]
		},
		{
			"name": "info_1",
			"type": "info",
			"label": "Text Info",
			"text": "Donec neque elit, mattis nec urna ac, ullamcorper sagittis nibh. Donec nec maximus ipsum. Quisque consequat mauris arcu, et porta ligula sollicitudin a. Cras vitae sodales lectus. Nunc turpis nisl, cursus cursus lacus non, sollicitudin iaculis eros. Donec nibh erat, scelerisque non dui eu, aliquet pulvinar felis. Nam rhoncus libero in magna aliquet, eget rutrum magna dictum. Pellentesque molestie turpis fermentum vulputate mattis."
		}
	],
	"categories": [
		{
			"name": "Reporter Name",
			"label": "Reporter Name Label",
			"multiple": false,
			"required": true,
			"hotkey": [
				"Ctrl",
				"Q"
			]
		},
		{
			"name": "Reporter Address",
			"multiple": false
		},
		{
			"name": "Issuer Name",
			"multiple": false,
			"required": true
		},
		{
			"name": "Transaction Date",
			"multiple": false,
			"iconType": "date",
			"required": true,
			"hotkey": [
				"1"
			]
		},
		{
			"name": "Security Title",
			"multiple": true
		},
		{
			"name": "Price",
			"multiple": true,
			"iconType": "money",
			"helperText": "should start with symbol $",
			"regexFnc": [
				"startSymbol"
			]
		},
		{
			"name": "Amount",
			"multiple": true,
			"regex": "^[0-9]*$",
			"helperText": "Number should be more than 10",
			"regexFnc": [
				"numberMore10"
			],
			"hotkey": [
				"Alt",
				"P"
			]
		},
		{
			"name": "Ownership",
			"multiple": true
		}
	]
}

These settings contain:

regexFunctions (object) (optional) - a special object which contains "key" → "value" pairs to specify custom fields validators, where "key" is your custom function name which can be used as a link in "regexFnc" option of field settings under "categories" setting. "value" - is your custom JavaScript lambda function.
taskInstructionText (string) (optional) - represents the instructions text that will appear in the popup
taskInstructionLink (string) (optional) - represents the link to the remote instructions source. Having at least one of these fields taskInstructionText or taskInstructionLink will cause the Instructions button to appear.
taskTypeLabel (string) (optional) - allows to configure the task title. By default, it is set to "Information Extraction".
autoSave (boolean) (optional) - allows saving intermediate task results automatically. By default, the setting has "false" value.
allowCustomValue (boolean) (optional) - enables the possibility to enter a custom value without any connection to the input document picture. This possibility is useful when OCR failed to extract some text from the document.
appLanguage (string) (optional) - allows the user to set up the Human Task localization. Currently available options are "en" and "ru". By default, the task displays as "en".
excludeUndefinedEntities (boolean) (optional) - allows getting the output only of fields that are configured in "categories" setting. By default, the setting has "false" value.
commentsSectionName (string) (optional) - is used to rename the "More" tab.
colors (list of strings) (optional) - is used to add more colors during fields tagging the original document.
categories (list of objects) (required) - list of fields to extract from the document, where each item contains:
- name (string) (required) - it serves as a key to receiving data from Human Task output.
- label (string) (optional) - is used to set the displayed name for a field. If no label is provided, the displayed name is taken from the name property.
- multiple (boolean) (required) - shows if this field may have multiple values (used when your document contains a list with details of several items, e.g. a list of products).
- required (boolean) (optional) - shows if the field is required for extraction, so human won't be able to submit Human Task without specifying the values for all required fields.
- helperText (string) (optional) - an additional text which is shown if the value is invalid due to additional validators specified by regexp settings.
- regex (string) (optional) - specifies a regular expression to use for values validation.
- regexFnc (list of string) (optional) - list of names of validator functions, which are specified in "regexFunctions".
- iconType (string) (optional) - allows to configure the field icon. Currently available options are "date", "money", "multiple", "text". The default icon is "text".
- hotkey (list of string) (optional) - list of keyboard shortcuts that can be pressed to select the item in the category.
metadata (list of objects) (optional) - is used to configure the "More" tab in Workspace and Document view in Document sets. The "More" tab is disabled if metadata is not defined. Checkbox "Invalid Document" on the "More" tab can be configured, but cannot be removed. If checkbox "Invalid Document" is checked all validations will be disabled.

Default "Invalid Document" checkbox can be customized using the settings below:

- name (string) (required) - is used to declare the section that configures the name and description of the "Invalid Document" checkbox. The value must be "isInvalid"; if any other value is used, the markLabel and description settings will have the default values.
- markLabel (string) (optional) - is used to define the name of the default checkbox in the "More" tab. By default, the setting has an "Invalid Document" value.
- description (string) (optional) - allows to add the description to default "Invalid Document" checkbox. By default, the description is not displayed.

Additional fields can be added to the "More" tab using the settings below:

name (string) (required) - serves as a key for receiving data of the "More" tab from Human Task output. Required for the following types: input, number_input, textarea, date, radio_group, select, checkbox, checkbox_group.
label (string) (optional) - is used to set a name for a field.
type (string) (required) - specifies the type of field. Should be one of the supported types: input, number_input, textarea, date, radio_group, select, checkbox, checkbox_group, info.
required (boolean) (optional) - shows if the field is required for filling in the "More" tab, so human won't be able to submit Human Task without specifying the values for all required fields in the "More" tab except the case when the checkbox "Invalid Document" is checked.
multiple (boolean) (optional) - is used only if "type": "select". Allows to select multiple items in the select dropdown list. Can work with autocomplete setting.
autocomplete (boolean) (optional) - is used only if "type": "select". Allows to search by items in the select dropdown list. Can work with multiple setting.
mask (string) (optional) - is used only if "type": "input". Allows to set a string of characters that indicates the format of valid input values. Default format characters: "9" for 0-9 characters; "a" for A-Z and a-z characters; "*" for A-Z, a-z, 0-9 characters. If required to have exactly "9" character (e.g., a phone code), use "\" before the character (e.g., "+4\9 99 999 99"; inside the Settings JSON the backslash itself must be escaped, i.e. "+4\\9 99 999 99"). If "required": true and a mask are set for "type": "input", input must be filled completely according to the mask.
minRows (number) (optional) - is used only if "type": "textarea". Specifies the minimum number of lines to display in the textarea. By default, the setting has the value "2".
maxRows (number) (optional) - is used only if "type": "textarea". Specifies the maximum number of lines to display in the textarea without a scroll appearing. If the number of rows is not within maxRows setting, a scroll appears. There is no default value and the field grows without limit.
disabled (boolean) (optional) - provides the possibility to disable the field which will prevent the user from filling this field.
validationRegExp (string) (optional) - provides possibility to validate filled values using regular expressions.
description (string) (optional) - allows to add a description to a field. Can be added for the following types: input, number_input, textarea, date, radio_group, select, checkbox, checkbox_group.
errorMessage (string) (optional) - is used to set the text for the error if the filled values do not match the regular expression in validationRegExp setting.
items (list of objects) (required only for "type": "checkbox_group", "radio_group", "select") - is used only if "type": "checkbox_group", "radio_group","select". Specifies possible items to check. Each item in the list has the following properties:
- value (string) (required) - the value of the item to display.
- label (string) (optional) - the label of the item to display. If no label is set, value is displayed instead. If both settings are set, label is displayed on Human Task, but value is set in Human Task Output.
- disabled (boolean) (optional) - provides the possibility to disable the item which will prevent the user from selecting it.
markLabel (string) (optional) - is used only if "type": "checkbox". Displays the label of the checkbox.
text (string or array) (optional) - is used only if "type": "info". Displays uneditable text for info field. If the text contains an array of strings, each string will be considered as a new paragraph.
keyboard (boolean) (optional) - is used only if "type": "date". Allows to enable manual date entry for datepicker form. By default, the setting has the value "false".

format (string) (optional) - is used only if "type": "date". Allows to set a custom date format. By default, the setting has a "MM/DD/YYYY" value. Supports the following formatting tokens for dates:

	Token	Output
Month	M	1 2 ... 11 12
	MM	01 02 ... 11 12
	MMM	Jan Feb ... Nov Dec
	MMMM	January February ... November December
Day of Month	D	1 2 ... 30 31
	DD	01 02 ... 30 31
Year	YY	70 71 ... 29 30
	YYYY	1970 1971 ... 2029 2030

Hotkeys

Information Extraction Human Task has the following hotkeys:

Tab - to switch to the next field on the right panel.
Shift + Tab - to switch to the previous field on the right panel.
Shift + area selection in the document - to append selected words to the end of the current field.
Shift + Enter - to add a line break in the edit mode.
Double click on bbox in the document - to apply the tag to the current field.
Ctrl + Z - to cancel last action.
Ctrl + Shift + Z - to return the last canceled action back.
Ctrl + Mouse Wheel - to zoom in/zoom out the document.

Category hotkey rules:

should not already be in use (e.g. by the task itself or the browser);
should not be repeated;
should not contain multiple letters or digits;
should contain only existing keyboard keys;
can be in any case (upper or lower);
can be one or more keys.

Input JSON for processed PDF files

Represents the document's OCR result in JSON format superimposed on the original document picture. It is generated automatically during OCR preprocessing:

Input JSON example for PDF documents

{
	"images": [ // There should be an array to support multipage PDFs. Each page should be defined individually in this case.
		{
			"content": "https://some-provider.com/img.jpg",
			//"json": <OCR-JSON>,
			"json_src" : "https://....ocr.json",
			"dimensions": { // The dimensions of the page in pixels.
				"width": 2000,
				"height": 3000
			}
		}
	]
}

Input JSON contains:
- images (list of objects) - the root element containing a list of document configurations. The list usually contains only one element.
    - content (url or base64 string) - the source of the input document to display. It may be an URL to a document or the document's content encoded in base64 (e.g. the string value "data:image/jpg;base64,R0lGOD....").
    - json_src or json (http link to a OCR-JSON file or JSON itself) - provides OCR information. OCR-JSON structure is described below.
    - dimensions (object) - an object contains "width" and "height" parameters which represent the width and height of the original input document.
      - width (integer) - width value of original input document.
      - height (integer) - height value of original input document.

OCR-JSON object has the following structure which is generated by the OCR component by itself:

OCR-JSON

"json": {
	"pages": [
		{
			"id": "page0",
			"areas": [
				{
					"id": "page0_area0",
					"paragraphs": [
						{
							"id": "page0_area0_paragraph0",
							"lines": [
								{
									"id": "page0_area0_paragraph0_line0",
									"words": [
										{
											"id": "page0_area0_paragraph0_line0_word0",
											"text": "Advanced",
											"properties": {
												"bbox": [
													0.05999032414126754,
													0.037290455011974,
													0.14078374455732948,
													0.046527540198426275
												],
												"x_fsize": 0,
												"x_wconf": 96
											}
										},
										...
									]
								}
							]
						}
					]
				}
			]
		}
	]
}

This JSON contains all OCRed words and is kept in a tree-like structure: pages → areas → paragraphs → lines → words

json (object) - root element
- pages (list of objects) - list of pages structure. Each page in the list has the following structure:
  - id (string) - id of the page
  - areas (list of objects) - list of areas structure. Each area in the list has the following structure:
    - id (string) - id of the area
    - paragraphs (list of objects) - list of paragraphs structure. Each paragraph in the list has the following structure:
      - id (string) - id of the paragraph
      - lines (list of objects) - list of lines structure. Each line in the list has the following structure:
        - id (string) - id of the line
        - words (list of objects) - list of words structure. Each word in the list has the following structure:
          - id (string) - id of the word
          - text (string) - original text extracted by OCR engine
          - properties (object) - property object with the following structure:
            - bbox (list of numbers) - top-left and bottom-right coordinates of the rectangle around the word in the original document. Coordinates are normalized to be from 0 to 1 relative to original document size
            - x_fsize (integer) - is the OCR-engine specific font size
            - x_wconf (integer) - OCR-engine specific confidence for the entire contained substring. Higher values express higher confidence

The following additional settings can be used in Input JSON:

overrides (optional) - allows overriding the field and its options from Document Type Settings if it is used as default Input. The field to be overridden should be declared in the block by its name. Then the settings of the field to be overridden have to be declared and defined. Please note that "name" and "type" cannot be overridden.
"overrides": {
"input_custom_error_1": {
"label": "Overridden input with custom error"
}
} - "label" for the field with the name "input_custom_error_1" will be overridden with the set value.
messages (optional) - allows displaying the block with a message or several messages. The block can be configured with various severity parameters: "info", "warning", "error".
- "messages": ["Info message"] - to display the block with message "Info message" with default severity "info";
- "messages": [{ "severity": "info", "text": "Info message" }] - to display the info message "Info message";
- "messages": [{ "severity": "warning", "text": "Warning message" }] - to display the warning message "Warning message";
- "messages": [{ "severity": "error", "text": "Error message" }] - to display the error message "Error message";
- "messages":
  [
  {
  "severity": "error",
  "text": "Error message"
  },
  {
  "severity": "info",
  "text": "Info message"
  },
  {
  "severity": "warning",
  "text": "Warning message"
  }
  ], - to display several messages of different severity.

Output JSON for processed PDF files

As an Output for PDF documents, the Information Extraction Human Task produces the following JSON:

Output JSON example for PDF Documents

{
	"entities": [
		{
			"content": "NASH JEFFREY M",
			"name": "Reporter Name",
			"words": [
				{
					"content": "NASH",
					"bbox": [
						0.06562091503267974,
						0.17454545454545456,
						0.1126797385620915,
						0.18383838383838383
					],
					"id": "page0_area17_paragraph0_line7_word0",
					"page": 0
				},
				{
					"content": "JEFFREY",
					"bbox": [
						0.11764705882352941,
						0.17474747474747473,
						0.18797385620915033,
						0.18383838383838383
					],
					"id": "page0_area17_paragraph0_line7_word1",
					"page": 0
				},
				{
					"content": "M",
					"bbox": [
						0.19294117647058823,
						0.17474747474747473,
						0.20784313725490197,
						0.18363636363636363
					],
					"id": "page0_area17_paragraph0_line7_word2",
					"page": 0
				}
			],
			"index": 0,
			"multiple": false
		}
	],
	"target": "metamask-inpage",
	"data": {
		"name": "metamask-provider",
		"data": {
			"method": "metamask_chainChanged",
			"params": {
				"chainId": "0x38",
				"networkVersion": "56"
			}
		}
	},
	"metadata": {
		"input_custom_error_1": "test",
		"datepicker_5": "2 Apr 2022",
		"datepicker_4": "April 03",
		"datepicker_3": "2022",
		"select_multiple_1": [
			"string_1",
			"string_2"
		],
		"select_2": "real_value_1",
		"autocomplete_multiple_1": [
			"string_1"
		],
		"checkbox_1": true,
		"number_input_1": "7",
		"radio_group_1": "real_value_1"
	}
}

It has the following structure:

entities (list of objects) - root element which contains a list of extracted entities. Each entity in this list has the structure as described below:
- content (string) - final output text of the extracted entity.
- name (string) - extracted entity name.
- words (list of objects) - list of word objects. If the extracted text consists of several words, this list will contain several word objects as follows:
  - content (string) - original text from the input document.
  - bbox (list of numbers) - top-left and bottom-right coordinates of the rectangle surrounding the word in the original document. The coordinates are normalized to be from 0 to 1 relative to the original document size.
  - id (string) - id of the word.
  - page (integer) - original document page number where the word appears.
- index (integer) - entity index.
- multiple (boolean) - shows if this field may have multiple values.
metadata (map) - exists only if fields from the More/renamed metadata section were filled. Represented as a key-value structure, where the key is the field name defined in Document Type Settings and the value is the value entered in the field.