Nemotron Nano Structured Data
Generate multi-format structured data (JSON, YAML, XML, Markdown) with controlled schema complexity, conversational grounding, and best-of-3 candidate generation. Used for Nemotron Nano structured-output SFT training.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "data-designer",
# ]
# ///
"""Nemotron Nano Structured Data Recipe: Multi-Format Schema Generation
Generate synthetic structured data across multiple output formats (JSON, YAML,
XML, Markdown) with controlled schema complexity, conversational grounding,
and best-of-3 candidate generation.
This recipe implements the pipeline used to produce structured-data SFT records
for Nemotron Nano training. Each record contains a generated schema, a natural
user request, grounding Q&A conversation pairs, and three candidate structured
outputs that conform to the schema.
Pipeline architecture:
┌─────────────────────────────────────────────────────────────────────────┐
│ STAGE 1: SAMPLING (DIVERSITY CONTROLS) │
│ │
│ Format Controls Schema Controls Conversation Controls │
│ ├─ output_format ├─ schema_rigidity ├─ num_turns │
│ └─ topic (conditional) ├─ schema_fields_count ├─ tone │
│ ├─ schema_complexity └─ detail_level │
│ └─ nesting_depth │
├────────────────────────────────────────────────────────────────────────┤
│ STAGE 2: SCHEMA GENERATION (LLM) │
│ Generates format-specific schema from sampled controls + topic. │
├────────────────────────────────────────────────────────────────────────┤
│ STAGE 3: USER PROMPT GENERATION (LLM) │
│ Natural-language request matching tone and detail level. │
├────────────────────────────────────────────────────────────────────────┤
│ STAGE 4: CONVERSATION PAIRS (LLM) │
│ Q&A pairs covering schema facts for grounding. │
├────────────────────────────────────────────────────────────────────────┤
│ STAGE 5: STRUCTURED OUTPUT (LLM, best-of-3) │
│ Three candidate structured outputs conforming to the schema. │
└────────────────────────────────────────────────────────────────────────┘
Prerequisites:
- NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
Run:
uv run structured_data.py
uv run structured_data.py --num-records 100 --output-format json
uv run structured_data.py --help
"""
from __future__ import annotations
from pathlib import Path
import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults
# =============================================================================
# Topics: representative subset of categories and subtopics
# =============================================================================
# Category -> five concrete subtopics; sampled per record (see the
# `topic_category` / `topic_subtopic` columns) to steer schema content.
TOPICS: dict[str, list[str]] = {
    "Leisure Activities": [
        "Outdoor Recreation", "Board Games", "DIY Crafts", "Photography", "Gardening",
    ],
    "Daily Life": [
        "Morning Routines", "Grocery Shopping", "Commuting", "Household Chores", "Meal Planning",
    ],
    "Education and Learning": [
        "Online Courses", "Study Techniques", "Language Learning", "STEM Education", "Library Systems",
    ],
    "Technology and Gadgets": [
        "Smartphones", "Smart Home Devices", "Wearable Tech", "Cloud Computing", "Cybersecurity Basics",
    ],
    "Health and Wellness": [
        "Nutrition Planning", "Mental Health", "Exercise Routines", "Sleep Hygiene", "Preventive Care",
    ],
    "Finance and Money": [
        "Personal Budgeting", "Investment Basics", "Tax Preparation", "Credit Management", "Retirement Planning",
    ],
    "Food and Cooking": [
        "Baking Techniques", "Meal Prep", "International Cuisines", "Dietary Restrictions", "Kitchen Equipment",
    ],
    "Travel and Transportation": [
        "Trip Planning", "Public Transit", "Road Trips", "Travel Insurance", "Packing Strategies",
    ],
    "Arts and Culture": [
        "Music Theory", "Film Analysis", "Theater Production", "Contemporary Art", "Creative Writing",
    ],
    "Work and Careers": [
        "Resume Building", "Interview Preparation", "Remote Work", "Project Management", "Career Transitions",
    ],
}
# =============================================================================
# Prompts
# =============================================================================
# Each prompt is a template whose `{{ column_name }}` placeholders are filled
# from previously generated columns (samplers or earlier LLM stages).
# NOTE: the string bodies below are program data rendered into LLM requests —
# their exact wording/whitespace is intentional.

# Stage 2: generate a format-specific schema from the sampled diversity
# controls (rigidity, field count, complexity, nesting depth) and topic.
SCHEMA_GENERATION_PROMPT = """\
Create a schema for a structured object response in the format {{ output_format }}.
Controls:
- Rigidity: {{ schema_rigidity }}
- Top-level properties: {{ schema_fields_count }}
- Complexity: {{ schema_complexity }}
- Nesting depth: {{ nesting_depth }}
- Topic: {{ topic_category }} / {{ topic_subtopic }}
Instructions:
- Output only an object with keys: "name", "schema", and "strict", formatted as \
{{ output_format }}.
- "name" must be appropriate with the Topic: {{ topic_category }} / {{ topic_subtopic }}
- "schema" should be a valid structured schema as specified in {{ output_format }}.
- Use {{ schema_fields_count }} top-level properties, relevant to the topic.
- Include at least one boolean and, if appropriate, one enum.
- All top-level properties must be listed in "required".
- Set "additionalProperties": false at every object level.
- If {{ schema_complexity }} is "complex", make the schema deeply nested: at least two \
levels of nested objects, with at least one object nested three levels deep. Keep nesting \
relevant to the topic.
- If "simple", keep nesting minimal or flat.
- "strict" must be true.
Formatting by output_format:
- "json": Output a valid JSON object, no code fences or comments.
- "yaml": Output a valid YAML object, no code fences or comments.
- "xml": Output a valid XML document with root "root" and child elements "name", "schema", \
and "strict". "schema" can be a string or nested XML.
- "markdown": Output a Markdown code block with the JSON object, using triple backticks and \
"json" as the language, no extra text.
Output only the object in the specified format. No explanations or extra text.
"""

# Stage 3: a natural user request matching sampled tone and detail level,
# without leaking the schema itself.
USER_PROMPT_GENERATION = """\
You are a human user asking an AI assistant to produce a structured output. Write a natural, \
concise request that would lead to filling in a schema about {{ topic_category }} / \
{{ topic_subtopic }}.
The request should:
- Sound like something a real person would type or say
- Describe what data they want without exposing the schema itself
- Mention the desired output format: {{ output_format }}
- Match the tone: {{ tone }} and detail level: {{ detail_level }}
Do not include the schema, code fences, or technical formatting. Just the user request.
"""

# Stage 4: grounding Q&A pairs that pin down the facts the final structured
# output must reflect; expects the model to return a Python list literal.
CONVERSATION_PROMPT = """\
Write a short Q&A conversation about the following topic. Follow the selected JSON Schema \
fields as the underlying facts to cover, but DO NOT output JSON here.
Topic context:
- Category: {{ topic_category }}
- Subtopic: {{ topic_subtopic }}
Constraints:
- Number of Q&A pairs: {{ num_turns }}
- Tone: {{ tone }}
- Detail level: {{ detail_level }}
Write alternating question/answer pairs that make these facts unambiguous for the chosen \
schema: {{ structured_schema }}
Return only a Python list of [question, answer] pairs (no extra text).
"""

# Stage 5: render one candidate instance (run three times for best-of-3),
# deriving values strictly from the Stage-4 answers.
# NOTE(review): XML instances use root <scene_response> while the schema
# prompt above mandates root "root" — presumably intentional in the original
# recipe, but worth confirming.
STRUCTURED_OUTPUT_PROMPT = """\
You will produce a {{ output_format }} instance that conforms strictly to the following \
schema (no extra keys).
Schema:
{{ structured_schema }}
You are given a Python list of [question, answer] pairs:
{{ conversation_pairs }}
Instructions:
- Derive values only from the answers given.
- Render ONLY the {{ output_format }} instance, with no commentary.
- Formatting rules:
- If output_format is "json", output a single JSON object (no code fences).
- If output_format is "yaml", output a YAML mapping (no code fences).
- If output_format is "xml", output an XML document with root <scene_response>.
- If output_format is "markdown", output a fenced code block with ```json.
- Ensure the content validates against the schema when parsed back to JSON.
"""
# =============================================================================
# Supported output formats
# =============================================================================
# Sampled uniformly per record unless a single format is pinned via
# build_config(output_format=...) / the --output-format CLI flag.
OUTPUT_FORMATS: list[str] = ["json", "yaml", "xml", "markdown"]
# =============================================================================
# Pipeline builder
# =============================================================================
def _add_category_column(
    config_builder: dd.DataDesignerConfigBuilder,
    name: str,
    values: list[str],
) -> None:
    """Add a categorical sampler column *name* drawing from *values*."""
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name=name,
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(values=values),
        )
    )


def _add_llm_column(
    config_builder: dd.DataDesignerConfigBuilder,
    name: str,
    prompt: str,
    model_alias: str,
) -> None:
    """Add an LLM text column *name* generated from the template *prompt*."""
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name=name,
            prompt=prompt,
            model_alias=model_alias,
        )
    )


def build_config(
    model_alias: str,
    output_format: str | None = None,
    num_candidates: int = 3,
) -> dd.DataDesignerConfigBuilder:
    """Build the five-stage structured-data pipeline configuration.

    Args:
        model_alias: Model alias used for every LLM-generated column.
        output_format: Restrict generation to a single format from
            ``OUTPUT_FORMATS``; ``None`` samples across all formats.
        num_candidates: Number of candidate structured outputs generated per
            record (best-of-N; defaults to the recipe's best-of-3).

    Returns:
        A config builder with all sampler and LLM columns registered, in the
        same order as the original recipe.

    Raises:
        ValueError: If *output_format* is not a supported format, or
            *num_candidates* is not positive.
    """
    if output_format is not None and output_format not in OUTPUT_FORMATS:
        raise ValueError(
            f"Unsupported output_format {output_format!r}; expected one of {OUTPUT_FORMATS}"
        )
    if num_candidates < 1:
        raise ValueError("num_candidates must be a positive integer")

    config_builder = dd.DataDesignerConfigBuilder()

    # ── Stage 1: Sampling (diversity controls) ───────────────────────────
    # Stable per-record identifier (short uppercase UUID, e.g. "SD-1A2B3C").
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="record_id",
            sampler_type=dd.SamplerType.UUID,
            params=dd.UUIDSamplerParams(prefix="SD-", short_form=True, uppercase=True),
        )
    )
    formats = [output_format] if output_format else OUTPUT_FORMATS
    _add_category_column(config_builder, "output_format", formats)
    _add_category_column(config_builder, "topic_category", list(TOPICS.keys()))
    # Subtopic is drawn conditionally on the sampled topic_category.
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="topic_subtopic",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(
                category="topic_category",
                values=TOPICS,
            ),
        )
    )
    # Remaining controls are plain categorical draws. Numeric-looking values
    # stay as strings because they are rendered verbatim into prompt text.
    _add_category_column(config_builder, "schema_rigidity", ["strict", "moderate"])
    _add_category_column(
        config_builder,
        "schema_fields_count",
        ["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"],
    )
    _add_category_column(config_builder, "schema_complexity", ["simple", "complex"])
    _add_category_column(config_builder, "nesting_depth", ["1", "2", "3", "4"])
    _add_category_column(
        config_builder, "num_turns", ["2", "3", "4", "5", "6", "7", "8"]
    )
    _add_category_column(
        config_builder, "tone", ["neutral", "enthusiastic", "factual"]
    )
    _add_category_column(
        config_builder,
        "detail_level",
        ["brief", "standard", "detailed", "super verbose"],
    )

    # ── Stages 2-4: schema, user prompt, grounding conversation ──────────
    _add_llm_column(
        config_builder, "structured_schema", SCHEMA_GENERATION_PROMPT, model_alias
    )
    _add_llm_column(config_builder, "user_prompt", USER_PROMPT_GENERATION, model_alias)
    _add_llm_column(
        config_builder, "conversation_pairs", CONVERSATION_PROMPT, model_alias
    )

    # ── Stage 5: best-of-N structured output candidates ──────────────────
    for i in range(num_candidates):
        _add_llm_column(
            config_builder,
            f"structured_output_{i}",
            STRUCTURED_OUTPUT_PROMPT,
            model_alias,
        )
    return config_builder
# =============================================================================
# Dataset creation
# =============================================================================
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the configured pipeline and materialize ``num_records`` records.

    Args:
        config_builder: Fully built pipeline configuration.
        num_records: Number of synthetic records to generate.
        artifact_path: Optional artifact directory; the library default is
            used when ``None``.

    Returns:
        Results handle exposing the final dataset path and analysis.
    """
    designer = DataDesigner(artifact_path=artifact_path)
    return designer.create(config_builder, num_records=num_records)
def main() -> None:
    """CLI entry point: parse arguments, run the pipeline, report results."""
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Nemotron Nano Structured Data Recipe")
    parser.add_argument(
        "--model-alias",
        type=str,
        default="nvidia-text",
        help="Model alias used for all LLM-generated columns",
    )
    parser.add_argument(
        "--num-records",
        type=int,
        default=5,
        help="Number of records to generate",
    )
    parser.add_argument(
        "--artifact-path",
        type=str,
        default=None,
        help="Directory for run artifacts (library default when omitted)",
    )
    parser.add_argument(
        "--output-format",
        type=str,
        default=None,
        choices=OUTPUT_FORMATS,
        help="Generate for a single output format (default: all formats)",
    )
    args = parser.parse_args()

    config_builder = build_config(
        model_alias=args.model_alias,
        output_format=args.output_format,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        artifact_path=args.artifact_path,
    )
    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
    # Render the post-run dataset analysis report.
    results.load_analysis().to_report()


if __name__ == "__main__":
    main()