Skip to main content

Import Text Regions to Text Files

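A single text region object label can be added to a text file by specifying the character range it covers, from `start` to `end`, using `Range`. Text region Ontology: the example below uses an `Edit` object with a `Corrections` radio attribute whose `English corrections` option contains a nested `Correction text` text attribute.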

from encord import EncordUserClient, Project
from encord.objects import Object, ObjectInstance
from encord.objects.coordinates import TextCoordinates
from encord.objects.frames import Range
from encord.objects.attributes import RadioAttribute, TextAttribute, Option

# --- Configuration: edit these values before running ---
# File path of the SSH private key used to authenticate with Encord.
SSH_PATH = "/Users/chris-encord/ssh-private-key.txt"
# Unique ID of the Project that receives the labels.
PROJECT_ID = "00000000-0000-0000-0000-000000000000"
# Bundle size handed to project.create_bundle() further down.
BUNDLE_SIZE = 100

# --- Connect to Encord ---
# US platform users: set domain="https://api.us.encord.com" instead.
user_client: EncordUserClient = EncordUserClient.create_with_ssh_private_key(
    domain="https://api.encord.com",
    ssh_private_key_path=SSH_PATH,
)

# --- Fetch the Project and sanity-check the result ---
project: Project = user_client.get_project(PROJECT_ID)
assert project is not None, f"Project with ID {PROJECT_ID} not found."

# Resolve every Ontology element the labels rely on, checking each lookup
# before moving on to the next one.
ontology_structure = project.ontology_structure
assert ontology_structure is not None, "Ontology structure not found in the project."

# Object titled "Edit": each text region is created as an instance of it.
text_object: Object = ontology_structure.get_child_by_title(type_=Object, title="Edit")
assert text_object is not None, "Ontology object for 'Edit' not found."

# Radio attribute titled "Corrections", looked up from the Ontology root.
correction_radio_attribute = ontology_structure.get_child_by_title(
    title="Corrections",
    type_=RadioAttribute,
)
assert correction_radio_attribute is not None, "Radio attribute 'Corrections' not found."

# Option "English corrections" of that radio attribute.
english_correction_option = correction_radio_attribute.get_child_by_title(
    title="English corrections",
    type_=Option,
)
assert english_correction_option is not None, "Option 'English corrections' not found under 'Corrections'."

# Text attribute "Correction text" nested under the option; it holds the
# free-text correction for each region.
english_correction_text_attribute = english_correction_option.get_child_by_title(
    title="Correction text",
    type_=TextAttribute,
)
assert english_correction_text_attribute is not None, (
    "Text attribute 'Correction text' not found under 'English corrections'."
)

# Labels
def _text_region(label_ref, start, end, correction_text):
    """Return one annotation record for the character range start..end."""
    return {
        "label_ref": label_ref,
        "coordinates": Range(start=start, end=end),
        "correction_text": correction_text,
    }


# Annotations to import, keyed by the title of the text file they belong to.
text_annotations = {
    "paradise-lost.txt": [
        _text_region(
            label_ref="text_region_001",
            start=5000,
            end=5050,
            correction_text="This needs to be updated for clarity.",
        ),
        _text_region(
            label_ref="text_region_002",
            start=6000,
            end=6050,
            correction_text="Rephrase for better readability.",
        ),
    ],
    "War and Peace.txt": [
        _text_region(
            label_ref="text_region_003",
            start=3000,
            end=3050,
            correction_text="Grammar correction required.",
        ),
        _text_region(
            label_ref="text_region_004",
            start=4000,
            end=4050,
            correction_text="Check for historical accuracy.",
        ),
    ],
}

# Initialize label rows
# Maps each data title to the label row that will receive its annotations.
label_row_map = {}

with project.create_bundle(bundle_size=BUNDLE_SIZE) as bundle:
    for data_title in text_annotations:
        matches = project.list_label_rows_v2(data_title_eq=data_title)
        if matches:
            # Use the first label row returned for this title.
            first_match = matches[0]
            first_match.initialise_labels(bundle=bundle)
            label_row_map[data_title] = first_match
        else:
            print(f"Skipping: No label row found for {data_title}")

# Apply labels
# Label rows that received at least an attempt at labelling and must be saved.
label_rows_to_save = []

for data_title, annotations in text_annotations.items():
    target_row = label_row_map.get(data_title)
    if target_row is None:
        # No label row was initialised for this title in the previous step.
        print(f"Skipping: No initialized label row found for {data_title}")
        continue

    for ann in annotations:
        # One object instance per annotated character range.
        region: ObjectInstance = text_object.create_instance()
        region.set_for_frames(
            frames=0,
            coordinates=TextCoordinates(range=[ann["coordinates"]]),
        )

        # Select the radio option first, then fill in its nested text attribute.
        region.set_answer(
            attribute=correction_radio_attribute,
            answer=english_correction_option,
        )
        region.set_answer(
            attribute=english_correction_text_attribute,
            answer=ann["correction_text"],
        )

        target_row.add_object_instance(region)
        print(f"Added [English correction] text region {ann['label_ref']} to {data_title}")

    label_rows_to_save.append(target_row)

# Save label rows
# Every label row collected in label_rows_to_save is saved through one bundle.
with project.create_bundle(bundle_size=BUNDLE_SIZE) as save_bundle:
    for lr in label_rows_to_save:
        lr.save(bundle=save_bundle)
        print(f"Saved label row for {lr.data_title}")

# Final confirmation once the bundle context has exited.
print("English correction labels applied.")

Import Classifications to Text Files

The Classification example uses an Ontology with nested attributes, structured as follows:
  • Accurate?
    • Yes
    • No
      • Correction (text field to provide edits for the correction)
`create_instance` must be called with `range_only=True` for text documents, including HTML documents.

# Import dependencies
from typing import List
from pathlib import Path
from encord import EncordUserClient, Project
from encord.objects.frames import Range
from encord.objects import LabelRowV2, Classification, Option, OntologyStructure

# Values to replace before running
SSH_PATH = "<file-path-to-ssh-private-key>"
PROJECT_ID = "<project-unique-id>"

# Create the user client from the contents of the SSH private key file
ssh_private_key_text = Path(SSH_PATH).read_text()
user_client: EncordUserClient = EncordUserClient.create_with_ssh_private_key(
    ssh_private_key_text
)

# Open the Project that the classification is added to
project: Project = user_client.get_project(PROJECT_ID)

# Title of the data unit (text or HTML file) to apply the classification to
DATA_UNIT_NAME = "<file-name-for-text-file>.html"

# Look up the label row for that data unit. The query returns a list, so check
# it before indexing: an empty result otherwise surfaces as a bare
# "list index out of range" that does not say which title failed to match.
matching_label_rows = project.list_label_rows_v2(data_title_eq=DATA_UNIT_NAME)
if not matching_label_rows:
    raise IndexError(
        f"No label row found for data unit {DATA_UNIT_NAME!r} in project {PROJECT_ID}"
    )
label_row = matching_label_rows[0]

# Download the existing labels
label_row.initialise_labels()

# Ontology attached to the label row
ontology_structure: OntologyStructure = label_row.ontology_structure

# Radio button classification; it is assumed to already exist in the Ontology.
radio_ontology_classification: Classification = ontology_structure.get_child_by_title(
    type_=Classification,
    title="<classification-name>",
)

# Option of that classification that will be selected as the answer
radio_classification_option = radio_ontology_classification.get_child_by_title(
    type_=Option,
    title="<option-name>",
)

# Step 1: build the classification instance.
# Text documents (HTML included) require `range_only=True`.
radio_classification_instance = radio_ontology_classification.create_instance(
    range_only=True
)

# Step 2: answer the classification with the chosen option.
radio_classification_instance.set_answer(radio_classification_option)

# Step 3: mark the frames on which the classification instance is present.
radio_classification_instance.set_for_frames(frames=0)

# Step 4: attach the instance to the label row, then save the labels.
label_row.add_classification_instance(radio_classification_instance)
label_row.save()

Export Labels for Text Files


# Import dependencies
from encord import EncordUserClient
import json

# Values to replace before running.
# Each placeholder is written as <...>; replace the whole placeholder,
# including the angle brackets.
SSH_PATH = "<file-path-to-ssh-private-key>"
PROJECT_ID = "<project-unique-id>"
DATA_UNIT_NAME = "<file-name-of-html-file>"

# Instantiate the client from the private key file located at SSH_PATH.
user_client = EncordUserClient.create_with_ssh_private_key(
    ssh_private_key_path=SSH_PATH
)

# Open the Project identified by PROJECT_ID; its labels are the ones exported.
project = user_client.get_project(PROJECT_ID)

# Look up the label row of the data unit named DATA_UNIT_NAME. The query
# returns a list, so check it before indexing: an empty result otherwise
# surfaces as a bare "list index out of range" with no hint about the cause.
label_rows = project.list_label_rows_v2(data_title_eq=DATA_UNIT_NAME)
if not label_rows:
    raise IndexError(
        f"No label row found for data unit {DATA_UNIT_NAME!r} in project {PROJECT_ID}"
    )
specific_label_row = label_rows[0]

# Download label information for the specific data unit
specific_label_row.initialise_labels()

# Print the labels as JSON
print(json.dumps(specific_label_row.to_encord_dict()))

Remove Labels from Text Files


from encord import EncordUserClient
import json

# Values to replace before running
SSH_PATH = "<file-path-to-ssh-private-key>"
PROJECT_ID = "<project-unique-id>"
DATA_UNIT_NAME = "<file-name-of-html-file>"

# Instantiate the client from the private key file located at SSH_PATH.
user_client = EncordUserClient.create_with_ssh_private_key(ssh_private_key_path=SSH_PATH)

# Open the Project identified by PROJECT_ID; it contains the label to remove.
project = user_client.get_project(PROJECT_ID)

# Unique ID (object hash) of the label to remove
OBJECT_HASH = '<label-unique-id>'

# Look up the label row of the data unit named DATA_UNIT_NAME. The query
# returns a list, so check it before indexing: an empty result otherwise
# surfaces as a bare "list index out of range" with no hint about the cause.
label_rows = project.list_label_rows_v2(data_title_eq=DATA_UNIT_NAME)
if not label_rows:
    raise IndexError(
        f"No label row found for data unit {DATA_UNIT_NAME!r} in project {PROJECT_ID}"
    )
specific_label_row = label_rows[0]

# Download the existing labels so the object instances can be inspected
specific_label_row.initialise_labels()

# Find the object instance whose hash matches, stopping at the first match.
object_to_remove = next(
    (
        object_instance
        for object_instance in specific_label_row.get_object_instances()
        if object_instance.object_hash == OBJECT_HASH
    ),
    None,
)

# Never hand None to remove_object(): report the missing label explicitly and
# leave the label row unsaved and unchanged.
if object_to_remove is None:
    raise LookupError(
        f"No object instance with hash {OBJECT_HASH!r} found in data unit {DATA_UNIT_NAME!r}"
    )

specific_label_row.remove_object(object_to_remove)

# Persist the removal
specific_label_row.save()


I