Title	Annotated output standard example

URL Name	000004168

Audience	Public
Product (Internal List) aiWare - aiWare
Body
Below is an annotated example of the full specification, demonstrating most of the capabilities:
sample = {
  /**
   * PREAMBLE
   * The preamble contains various high-level information for this vtn-standard document.
   */
  // Schema version to validate engine outputs against (optional)
  "schemaId": "https://docs.veritone.com/schemas/vtn-standard/master.json",
  // Denotes the engine that created it (optional, provided by Veritone)
  "sourceEngineId": "<GUID>",
  // Engine name used to generate output (optional, provided by Veritone)
  "sourceEngineName": "engine_x",
  // Task payload describing the associated tasks that summon the engine (optional, provided by Veritone)
  "taskPayload": {
    // "key": value pairs from the payload for this task
  },
  // The associated task (optional, provided by Veritone)
  "taskId": "<TASK_ID>",
  // Date this document was generated (optional, set by Veritone if not included)
  // Format: ISO8601
  "generatedDateUtc": "2017-12-08T17:19:02Z",
  // Vendor specific reference.  Used to map engine output against vendor referenced data ID (optional)
  "externalSourceId": "<string>",
  // Specification for the contracts used for output validation (optional)
  // See http://docs.veritone.com/#/engines/engine_standards/capability/ for more information
  "validationContracts": [
    "text", "face", // ...
  ],
  /**
   * OVERALL FILE DATA
   * Data in this section applies to the file being analyzed as a whole.
   * This is a commonly used section for files with no time spans like
   * images or text documents, or for expressing summary data that spans
   * the entire length of a media file.
   *
   * For data that is specific to a particular object or a particular
   * point in time inside the file, see the lower sections.
   */
  // Tags associated with this file (optional)
  // Format: { "key": "<name>", "value": "<value>" }
  // - For ground truth:  Set tag to be "groundTruth": "<provider>"
  // - For content moderation, Key must be: moderation:adult, moderation:violence, moderation:nsfw,
  //   moderation:nudity, moderation:fakeNews, moderation:pii
  // - For gender: gender[value=male|female]
  "tags": [{
    "key": "foo",
    "value": "bar", // OPTIONAL.  If not specified, defaults to true
    "score": 0.12 // OPTIONAL
  }, {
    "key": "foo",
    "value": "bar2"
  }],
  // Collation order (optional)
  // Indicates relative order of this file. Files with lower collateIndex values come before higher collateIndex values.
  // Used to preserve ordering, esp. when parallel processing large documents. Indexes do not have to be sequential, but may not be duplicated.
  // You may use whatever integers are convenient: indexes, page number, byte offsets, etc.
  "collateIndex": 4,
  // Language Identification (optional)
  // Format: BCP-47 https://tools.ietf.org/rfc/bcp/bcp47.txt
  "language": "en-US",
  // Summary of document (optional)
  "summary": "",
  // Sentiment (optional)
  // Provides a rating of how positive and/or negative some text is. 
  // Text can be purely positive ("I love you."), negative ("This tastes rotten."), or mixed ("I appreciate that you tried, but this is a failure.")
  // Values: 0.0 (neutral) to 1.0 (extreme). Confidence: 0.0 (not) to 1.0 (confident)
  "sentiment": {
    "positiveValue": 0.12, // REQUIRED if negativeValue is not provided or positiveConfidence is provided
    "positiveConfidence": 0.12, // OPTIONAL
    "negativeValue": 0.12, // REQUIRED if positiveValue is not provided or negativeConfidence is provided
    "negativeConfidence": 0.12 // OPTIONAL
  },
  // GPS coordinates for this file (optional)
  // Format: UTM (preferred) | WGS 1984
  "gps": [{
    "latitude": 59.123,
    "longitude": 213.123,
    "precision": 100, //in meters
    "direction": 10.1, // 0-360
    "velocity": 100.00, //in meters
    "altitude": 123.12 //in meters
  }],
  // Emotions (optional)
  // Can be specified for whole file (here) for overall tone,
  // in an object (e.g. face recognition),
  // in series (e.g. for transcript/sentiment), or
  // in series.object (e.g. for time-specific face recognition)
  "emotions": [{
    "emotion": "angry", // STRING: angry, happy, sad.  Can be any string field.
    "emotionValue": 0.12, // OPTIONAL: How strong.  0 = none, 1.0 = 100%
    "emotionConfidence": 0.88 // OPTIONAL: 0 = 0%, 1.0 = 100%
  }],
  /**
   * OVERALL FILE OBJECTS
   * Data in this section applies to things (e.g. faces, objects, logos, OCR)
   * detected in the file but not in a specific time range.
   */
  // Object (Face, Object, Logo, OCR, ..) (optional)
  "object": [{
    // Object type (REQUIRED)
    // Options:
    // - object: Object detection
    // - face: Face detection
    // - facial-features: Facial Features
    // - licensePlate: License plate detection
    // - logo: Logo detection
    // - speaker: Speaker recognition
    // - sound: Sound recognition
    // - concept: Concept recognition
    // - keyword: Keyword detection
    // - text: Recognized or extracted text (OCR / text extraction)
    // - namedEntity: Entity extraction
    // - face-verification: Face verification
    // - speaker-verification: Speaker (voice) verification
    "type": "object",
    // Main label for this object (optional)
    // REQUIRED if no other identifying information (e.g. text, entityId) is specified
    "label": "dog",
    // URI to thumbnail to show in the UI (optional)
    // If not provided but boundingPoly is provided,
    // one can be constructed dynamically from the boundingPoly.
    "uri": "<URI>",
    // Entity reference (optional)
    "entityId": "<GUID>",
    "libraryId": "<GUID>",
    // Confidence score (optional)
    "confidence": 0.99234, // 0-1
    // Text found (optional)
    // REQUIRED for OCR and text extraction
    "text": "The quick brown fox jumped over the lazy dog",
    // Collation order (optional)
    // Indicates relative order of this object. Objects with lower collateIndex values come before higher collateIndex values.
    // Used to preserve ordering, esp. when parallel processing. Indexes do not have to be sequential, but may not be duplicated.
    // You may use whatever integers are convenient: indexes, frame numbers, byte offsets, etc.
    "collateIndex": 4,
    // Document location (optional)
    // For referencing where in a document recognized text or entities or occur.
    // It is highly recommended to define at least one to ensure proper ordering for indexing.
    // For non-paginated document types like plain text files you can simply enumerate paragraphs based on line breaks.
    "page": 5,
    "paragraph": 3,
    "sentence": 2,
    // Used for verification engines (optional)
    // Valid inputs: "enroll" or "verify"
    "mode": "verify",  // REQUIRED
    // Transcription (optional)
    // An auxiliary output used for speaker verification engines. Provides a confidence score between the
    // transcribed audio and a specified phrase.
    "transcription": {
      "text": "hello world",  // OPTIONAL
      "confidence": 0.80      // REQUIRED
    },
    // Lip Voice Correlation (optional)
    // Used for the facial-features engine.
    "lipVoiceCorrelation": {
      "confidence": 0.9   // REQUIRED
    },
    // Lip Voice Correlation (optional)
    // Used for the facial-features engine.
    "lipMovement": {
      "confidence": 1.5   // REQUIRED
    },
    // Sentiment (optional)
    // Provides a rating of how positive and/or negative some text is. 
    // Text can be purely positive ("I love you."), negative ("This tastes rotten."), or mixed ("I appreciate that you tried, but this is a failure.")
    // Values: 0.0 (neutral) to 1.0 (extreme). Confidence: 0.0 (not) to 1.0 (confident)
    "sentiment": {
      "positiveValue": 0.12, // REQUIRED if negativeValue is not provided or positiveConfidence is provided
      "positiveConfidence": 0.12, // OPTIONAL
      "negativeValue": 0.12, // REQUIRED if positiveValue is not provided or negativeConfidence is provided
      "negativeConfidence": 0.12 // OPTIONAL
    },
    // Emotions (optional)
    // For an object (e.g. face detection, voice analysis, text analysis) in the whole file
    "emotions": [{
      "emotion": "angry", // STRING: angry, happy, sad.  Can be any string field.
      "emotionValue": 0.12, // OPTIONAL: How strong.  0 = none, 1.0 = 100%
      "emotionConfidence": 0.88 // OPTIONAL: 0 = 0%, 1.0 = 100%
    }],
    // Age in years (optional)
    "age": {
      "min": 20,
      "max": 50,
      "confidence": 0.2 // 0-1
    },
    // Face landmarks (optional)
    "faceLandmarks": [{
      "type": "mouth",
      // Ordered array of (x,y) coordinates in percentage of axis
      // Implicit line from last to first
      "locationPoly": [{
        "x": 0.1,
        "y": 0.2
      }],
    }],
    // Object detection / keyword detection (optional)
    "objectCategory": [{
      "class": "animal",
      "@id": "kg:/m/0dl567",
      "confidence": 0.567
    }],
    // Specifies the region match was found (optional)
    // Valid values: "left", "right", "top", "bottom"
    "region": "left",
    // Bounding polygon (optional)
    // Ordered array of (x,y) coordinates in percentage of axis
    // Implicit line from last to first
    "boundingPoly": [{
      "x": 0.1,
      "y": 0.2
    }],
    // GPS coordinates for this object (optional)
    // Format: UTM (preferred) | WGS 1984
    "gps": [{
      "latitude": 59.123,
      "longitude": 213.123,
      "precision": 100, //in meters
      "direction": 10.1, // 0-360
      "velocity": 100.00, //in meters
      "altitude": 123.12 //in meters
    }],
    // Structured data values for this object (optional)
    "structuredData": {
      "<schemaGuid>": { // GUID of the aiWARE schema ID this structured data object conforms to
        "<key>": "<value>",
        // ...
        "<keyN>": "<value>",
      }
    },
    // Custom data for this object (optional)
    // You can add any arbitrary data inside this object.
    // It will not be indexed, searchable, or have any impact on the system.
    // But it will be returned when reading the data back out.
    "vendor": {
      // custom key:value pairs...
    }
  }], // END OBJECT
  // Media (for linking to files when the engine's cognition results in file outputs)
  "media": [{
    "assetId": "<ID of the associated asset>",
    // Content Type (optional)
    // Must be a valid MIME type (see https://www.iana.org/assignments/media-types/media-types.xhtml)
    "contentType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    // Language Identification (optional)
    // Format: BCP-47 https://tools.ietf.org/rfc/bcp/bcp47.txt
    "language": "en"
  }],
  // Custom data for this document (optional)
  // You can add any arbitrary data inside this object.
  // It will not be indexed, searchable, or have any impact on the system.
  // But it will be returned when reading the data back out.
  "vendor": {},
  /**
   * TIME SERIES DATA
   * Data in this section applies to a specific time ranges within the file.
   * This is the most common section used for insights from audio and video files.
   */
  // Series (optional)
  "series": [{
    // Start and stop times (REQUIRED)
    // Time span in milliseconds (relative to the source asset start) of this time slice
    "startTimeMs": 1260,
    "stopTimeMs": 1360,
    // Tags associated with this time slice (optional)
    // Format: { "key": "<name>", "value": "<value>" }
    // - For speech detected: speech=true
    // - For silence detected: silence=true
    // - For partial output: partial=true
    // - For ground truth:  Set tag to be "groundTruth": "<provider>"
    // - For content moderation, Key must be: moderation:adult, moderation:violence, moderation:nsfw,
    //   moderation:nudity, moderation:fakeNews, moderation:pii
    // - For gender: gender[value=male|female]
    "tags": [{
      "key": "foo",
      "value": "bar", // OPTIONAL.  If not specified, defaults to true
      "score": 0.12 // OPTIONAL
    }, {
      "key": "foo",
      "value": "bar2"
    }],
    // Summary of time slice (optional)
    "summary": "",
    // Speaker identification (optional)
    // Example: "channel0", "speaker1", ...
    "speakerId": "<Speaker Identifier>", // can be "<libraryId>:<entityId>"
    // Optional
    // Transcript (optional)
    // JSON utterance (all word edges between 2 time nodes)
    // Array of n objects describing each alternative word
    "words": [{
      // The word spoken (required)
      "word": "!silence",
      // The confidence level of the detected word spoken (optional)
      // Range should be from: 0.0 - 1.00
      "confidence": 0.794,
      // Is this word included in the best path through a transcript lattice? (optional)
      "bestPath": true,
      // Number of consecutive time-slices the utterance spans (optional)
      // example: of->thrones----->
      //          of->their-->own->
      // utteranceLength: thrones: 2; their,own: 1
      "utteranceLength": 1
    }],
    // Language Identification (optional)
    // Format: BCP-47 https://tools.ietf.org/rfc/bcp/bcp47.txt
    "language": "en-US",
    // Sentiment (optional)
    // Provides a rating of how positive and/or negative some text is. 
    // Text can be purely positive ("I love you."), negative ("This tastes rotten."), or mixed ("I appreciate that you tried, but this is a failure.")
    // Values: 0.0 (neutral) to 1.0 (extreme). Confidence: 0.0 (not) to 1.0 (confident)
    "sentiment": {
      "positiveValue": 0.12, // REQUIRED if negativeValue is not provided or positiveConfidence is provided
      "positiveConfidence": 0.12, // OPTIONAL
      "negativeValue": 0.12, // REQUIRED if positiveValue is not provided or negativeConfidence is provided
      "negativeConfidence": 0.12 // OPTIONAL
    },
    // Emotions detected (optional)
    "emotions": [{
      "emotion": "angry", // STRING: angry, happy, sad.  Can be any string value.
      "emotionValue": 0.12, // OPTIONAL: How strong.  0 = none, 1.0 = 100%
      "emotionConfidence": 0.88 // OPTIONAL: 0 = 0, 1.0 = 100%
    }],
    // Entity reference (optional)
    "entityId": "<GUID>",
    "libraryId": "<GUID>",
    // Object (Face, Object, Logo, OCR, ..) (optional)
    "object": {
      // Object type (REQUIRED)
      // Options:
      // - object: Object detection
      // - face: Face detection
      // - licensePlate: License plate detection
      // - logo: Logo detection
      // - fingerprint: Audio fingerprinting
      // - speaker: Speaker recognition
      // - sound: Sound recognition
      // - concept: Concept recognition
      // - keyword: Keyword detection
      // - text: Recognized or extracted text (OCR / text extraction)
      // - namedEntity: Entity extraction
      // - barcode
      "type": "object",
      // Main label for this object (optional)
      // REQUIRED if no other identifying information (e.g. text, entityId) is specified
      "label": "cat",
      // URI to thumbnail to show in the UI (optional)
      // If not provided but boundingPoly is provided,
      // one can be constructed dynamically from the boundingPoly.
      "uri": "<URI>",
      // Entity reference (optional)
      "entityId": "<GUID>",
      "libraryId": "<GUID>",
      // Confidence score (optional)
      "confidence": 0.99234, // 0-1
      // Text found (optional)
      // REQUIRED for OCR and text extraction
      "text": "The quick brown fox jumped over the lazy dog",
      // Emotions (optional)
      // For an object (e.g. face detection, voice analysis, text analysis) in the series
      "emotions": [{
        "emotion": "angry", // STRING: angry, happy, sad.  Can be any string field.
        "emotionValue": 0.12, // OPTIONAL: How strong.  0 = none, 1.0 = 100%
        "emotionConfidence": 0.88 // OPTIONAL: 0 = 0, 1.0 = 100%
      }],
      // Age in years (optional)
      "age": {
        "min": 20,
        "max": 50,
        "confidence": 0.2 // 0-1
      },
      // Face landmarks (optional)
      "faceLandmarks": [{
        "type": "mouth",
        // Ordered array of (x,y) coordinates in percentage of axis
        // Implicit line from last to first
        "locationPoly": [{
          "x": 0.1,
          "y": 0.2
        }],
      }],
      // Object detection / keyword detection (optional)
      "objectCategory": [{
        "class": "animal",
        "@id": "kg:/m/0dl567",
        "confidence": 0.567
      }],
      // Specifies the region match was found (optional)
      // Valid values: "left", "right", "top", "bottom"
      "region": "left",
      // Bounding polygon (optional)
      // Ordered array of (x,y) coordinates in percentage of axis
      // Implicit line from last to first
      "boundingPoly": [{
        "x": 0.1,
        "y": 0.2
      }],
      // GPS coordinates for this object (optional)
      // Format: UTM (preferred) | WGS 1984
      "gps": [{
        "latitude": 59.123,
        "longitude": 213.123,
        "precision": 100, //in meters
        "direction": 10.1, // 0-360
        "velocity": 100.00, //in meters
        "altitude": 123.12 //in meters
      }],
      // Structured data values for this object (optional)
      "structuredData": {
        "<schemaGuid>": { // GUID of the aiWARE schema ID this structured data object conforms to
          "<key>": "<value>",
          // ...
          "<keyN>": "<value>",
        }
      },
      // Custom data for this object (optional)
      // You can add any arbitrary data inside this object.
      // It will not be indexed, searchable, or have any impact on the system.
      // But it will be returned when reading the data back out.
      "vendor": {
        // custom key:value pairs...
      }
    }, // END OBJECT
    // GPS coordinates for this time-series entry (optional)
    // Format: UTM (preferred) | WGS 1984
    "gps": [{
      "latitude": 59.123,
      "longitude": 213.123,
      "precision": 100, //in meters
      "direction": 10.1, // 0-360
      "velocity": 100.00, //in meters
      "altitude": 123.12 //in meters
    }],
    // Structured data values for this time-series entry (optional)
    "structuredData": {
      "<schemaGuid>": { // GUID of the aiWARE schema ID this structured data object conforms to
        "<key>": "<value>",
        // ...
        "<keyN>": "<value>",
      }
    },
    // Media (for linking to files when the engine's cognition results in file outputs)
    "media": {
      "assetId": "<ID of the associated asset>",
      // Content Type (optional)
      // Must be a valid MIME type (see https://www.iana.org/assignments/media-types/media-types.xhtml)
      "contentType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      // Language Identification (optional)
      // Format: BCP-47 https://tools.ietf.org/rfc/bcp/bcp47.txt
      "language": "en"
    },
    // Custom data for this time-series entry (optional)
    // You can add any arbitrary data inside this object.
    // It will not be indexed, searchable, or have any impact on the system.
    // But it will be returned when reading the data back out.
    "vendor": {
      // custom key:value pairs...
    }
  }]
};
Created Date	8/6/2024 9:51 PM

Last Modified Date	8/7/2024 9:47 PM

Last Published Date	8/7/2024 9:47 PM

Article Record Type	Documentation

Veritone Record Type	Documentation

Article Number	000004168
Annotated output standard example