name = "qsv Default Prompt File" description = "Default prompt file for qsv's describegpt command." author = "qsv team" version = "7.0.0" tokens = 20000 base_url = "https://api.openai.com/v1" model = "openai/gpt-oss-20b" timeout = 300 format = "markdown" # responses will be in this language/dialect, or model default if not set # Can be set to "traditional" languages like Spanish, German, French, Hindi, Mandarin, etc. # Or "dialects" like Taglish, Pig Latin, Valley Girl, Pirate, Shakespearean English, etc. language = "" ### NOTE: The following VARIABLES are available in Mini Jinja templates (use {{ variable }} syntax): # {{ stats }} - summary statistics in CSV format # {{ frequency }} - frequency distribution in CSV format # {{ dictionary }} - data dictionary in JSON format # {{ json_add }} - if --json option is set, inserts (note leading space) # ` (in valid, pretty-printed JSON format, ensuring string values are properly escaped)` # otherwise inserts # ` (in Markdown format)` # {{ input_table_name }} - the name of the input CSV file. # In DuckDB SQL mode, this is replaced with the read_csv_auto function call, # e.g. `read_csv_auto('input.csv')`. # In Polars SQL --prompt mode, this is replaced with the table alias, e.g. `_t_1`. # {{ generated_by_signature }} - Attribution metadata and warning placeholder # {{ duckdb_version }} - only up to the minor version - e.g. "1.3" # {{ top_n }} - The enum threshold for the frequency command # {{ num_tags }} - The maximum number of tags to infer when the --tags option is used # {{ tag_vocab }} - The tag vocabulary file content # {{ language }} - The language to use for the response. Empty string if not set. Has leading space if set. # {{ headers }} - CSV headers # {{ delimiter }} - CSV delimiter character # {{ sample_file }} - The temporary file containing a random sample of the input CSV file. # Only present if --prompt is set. # {{ sample_size }} - The number of rows in the random sample. 
system_prompt = """ You are an expert library scientist with extensive expertise in Statistics, Data Science and SQL. You are also an expert on the DCAT-US 3 metadata specification (https://doi-do.github.io/dcat-us/). When you are asked to generate a Data Dictionary, Description or Tags, use the provided Summary Statistics and Frequency Distribution to guide your response. They both describe the same Dataset and are joined on the `field` column. The provided Summary Statistics is a CSV file. Each record contains statistics for each Dataset field. The provided Frequency Distribution is a CSV file with these columns - `field`, `value`, `count`, `percentage`, `rank`. For each Dataset field, it lists the top {{ top_n }} most frequent unique values sorted in descending order, with the special value "Other (N)" indicating "Other" unique values beyond the top {{ top_n }}. The "N" in "Other (N)" indicates the count of "Other" unique values. The "Other" category has a special rank of 0. The Frequency Distribution's `rank` column is 1-based and is calculated based on the count of the values, with the most frequent having a rank of 1. In case of ties, `rank` is calculated based on the "dense" rank-strategy (AKA "1223" ranking). For Dataset fields with all unique values (i.e. cardinality is equal to the number of records), the Frequency Distribution's `value` column is the special value ""; `count` - the number of records; `percentage` - 100; and `rank` - 0. """ dictionary_prompt = """ Generate{% if language %} {{ language|trim|title }}{% endif %} Labels and Descriptions for ALL fields in the Dataset. Use the Summary Statistics and Frequency Distribution to understand the context and meaning of each field. {% set headers_list = headers|split(",")|list %} The Dataset has {{ headers_list|length }} field{{ headers_list|length | pluralize }}: {%- for header in headers_list %} {{ loop.index }}. 
{{ header | trim }} {%- endfor %} For each field, provide: - Label: a human-friendly label for this field (e.g., "Unique Key", "Created Date", "Agency Name") - Description: a full description for this field (can be multiple sentences) Return the results in JSON format where each field name is a key, and the value is an object with "label" and "description" properties: {% raw %}{ "field_name_1": { "label": "Human-friendly label", "description": "Full description of the field" }, "field_name_2": { "label": "Another label", "description": "Another description" } }{% endraw %} Let's think step by step, correcting yourself as needed. --- Summary Statistics (CSV): {{ stats }} Frequency Distribution (CSV): {{ frequency }} """ description_prompt = """ Generate a{% if language %} {{ language|trim|title }}{% endif %} Description based on the following Summary Statistics and Frequency Distribution data about the Dataset. Let's think step by step. --- Summary Statistics (CSV): {{ stats }} Frequency Distribution (CSV): {{ frequency }} --- Do not output the summary statistics for each field. Do not output the frequency for each field. Do not output data about each field individually, but instead output about the dataset as a whole in one 1-8 sentence description. After the Description, add a section titled "Notable Characteristics" with a bulleted list of notable characteristics of the Dataset (e.g. the central tendency and spread of the data, the distribution shape, anomalies, patterns; if there are any outliers, missing values, duplicates, PII/PHI/PCI data; and other data quality issues that the User should be aware of). {% if language %} Make sure both the Description and Notable Characteristics are in this language: {{ language|trim|title }}{% endif %} Add an Attribution with the placeholder "{{ generated_by_signature }}" at the bottom of the output. The entire output should be in Markdown format. 
""" tags_prompt = """ A Tag is a keyword or label that categorizes datasets with other, similar datasets. Using the right Tags makes it easier for others to find and use datasets. {% if tag_vocab %} Limit your choices to only {{ num_tags }} unique Tags{{ json_add }} in the following Tag Vocabulary, in order of relevance, based on the Summary Statistics and Frequency Distribution about the Dataset provided further below. The Tag Vocabulary is a CSV with 2 columns: Tag and Description. Take the Description into account to guide your Tag choices. Tag Vocabulary (CSV): {{ tag_vocab }} {% else %}{# NOTE: If no tag vocabulary is provided, use the default tag generation prompt #} Choose no more than {{ num_tags }} unique Tags{{ json_add }} about the contents of the Dataset in descending order of importance (lowercase only and use _ to separate words) based on the Summary Statistics and Frequency Distribution about the Dataset provided below. Do not use field names in the tags. {% if language %} Make sure your tag choices are in this language: {{ language|trim|title }} {% endif %} {% endif %} Add an Attribution with the placeholder "{{ generated_by_signature }}" after the Tags. If generating JSON format, add the Attribution as a separate key at the top level of the JSON object, after the Tags, otherwise add it at the bottom of the Tags in Markdown format. Let's think step by step, correcting yourself as needed. --- Summary Statistics (CSV): {{ stats }} Frequency Distribution (CSV): {{ frequency }}""" prompt = "What is this dataset about?" custom_prompt_guidance = """ We need to answer the User's Prompt above. If the User's Prompt is not about the Dataset, immediately return 'I'm sorry. 
I'm afraid I can only answer questions about the Dataset.'{% if language %} in this language: {{ language|trim|title }}{% endif %} If the User's Prompt can be answered by using the Dataset's Summary Statistics and Frequency Distribution data below, immediately return the answer{% if language %} in this language: {{ language|trim|title }}{% endif %}. Otherwise, using the Dataset's Summary Statistics, Frequency Distribution and Data Dictionary below, create a SQL query that can be used to answer the User's Prompt. {# IMPORTANT: make sure the following three lines are not modified, as the DB-specific guidance is injected dynamically here#} SQL Query Generation Guidelines: END SQL Query Generation Guidelines - Make sure the generated SQL query is valid and has{% if language %} {{ language|trim|title }}{% endif %} comments to explain the query - Add "-- {{ generated_by_signature }}" at the top of the query {% if sample_file %} Here is a {{ sample_size }}-row random sample of the Dataset. Use this sample to understand the Dataset along with the Data Dictionary, the Summary Statistics and Frequency Distribution to generate the SQL query: Sample Data (CSV): {{ sample_file }} {% endif %} Return the SQL query as a SQL code block preceded by a newline. Let's think step by step, correcting yourself as needed. 
--- Summary Statistics (CSV): {{ stats }} Frequency Distribution (CSV): {{ frequency }} Data Dictionary (JSON): {{ dictionary }}""" ### DuckDB SQL "One-Shot" Query Generation Guidelines ### This list replaces the "END SQL Query Generation Guidelines" marker in custom_prompt_guidance duckdb_sql_guidance = """ - Use DuckDB {{ duckdb_version }} syntax - The input csv has headers and uses {{ delimiter }} as the delimiter - Column names with spaces and special characters are case-sensitive and should be enclosed in double quotes - Only use the `read_csv_auto` table function to read the input CSV - Always explicitly cast columns to the appropriate type as per the Data Dictionary using the `CAST` function before using them in calculations. - Use the placeholder {{ input_table_name }} for the input csv in the `read_csv_auto` table function call """ ### Polars SQL "One-Shot" Query Generation Guidelines ### This list replaces the "END SQL Query Generation Guidelines" marker in custom_prompt_guidance polars_sql_guidance = """ - Use Polars SQL syntax (which is a dialect of PostgreSQL) - Use the Dataset's Summary Statistics, Frequency Distribution and Data Dictionary data to generate the SQL query - Use {{ input_table_name }} as the placeholder for the table name to query - Column names with embedded spaces and special characters are case-sensitive and should be enclosed in double quotes - Only use SQL functions supported by Polars SQL. Supported functions are listed here - https://github.com/pola-rs/polars/blob/45819db8865b9665e9dfab5b3a54752c8be9d599/crates/polars-sql/src/functions.rs#L773-L890 - Always prefix aliases with the "alias_" prefix. - Always explicitly cast columns to the appropriate type as per the Data Dictionary using the `CAST` function before using them in calculations. 
- For date operations, use REGEXP_LIKE as Polars SQL does not have many date functions apart from `date_part` and `strftime` - Do not use SIMILAR TO and CROSS JOIN in the generated SQL query - Only use comments with the `--` prefix. - Do not use comments using the `/*` and `*/` syntax, as Polars SQL does not support it """ ### Additional DuckDB SQL Few-Shot Learning Examples when --fewshot-examples is set dd_fewshot_examples = """ ## Few-Shot Learning Examples Here are examples of common query patterns to follow: ### Example 1: Basic Aggregation **User Question:** "What is the kurtosis of a given numerical column?" **SQL Query:** ```sql -- {{ generated_by_signature }} -- Calculate kurtosis of a given numerical column SELECT kurtosis(column_name) as kurtosis_value FROM read_csv_auto({{ input_table_name }}); ``` ### Example 2: Group By with Aggregation **User Question:** "Show me the count by category" **SQL Query:** ```sql -- {{ generated_by_signature }} -- Group records by category and count occurrences SELECT category, COUNT(*) as count FROM read_csv_auto({{ input_table_name }}) GROUP BY category ORDER BY count DESC; ``` ### Example 3: Date/Time Analysis **User Question:** "What are the trends over time?" **SQL Query:** ```sql -- {{ generated_by_signature }} -- Analyze trends by date, assuming there's a date column SELECT DATE(date_column) as date, COUNT(*) as daily_count, AVG(numeric_column) as avg_value FROM read_csv_auto({{ input_table_name }}) WHERE date_column IS NOT NULL GROUP BY DATE(date_column) ORDER BY date; ``` ### Example 4: Top N Analysis **User Question:** "What are the top 10 most frequent values?" 
**SQL Query:** ```sql -- {{ generated_by_signature }} -- Find top 10 most frequent values in a column SELECT column_name, COUNT(*) as frequency FROM read_csv_auto({{ input_table_name }}) WHERE column_name IS NOT NULL GROUP BY column_name ORDER BY frequency DESC LIMIT 10; ``` ### Example 5: Conditional Analysis **User Question:** "How many records meet specific criteria?" **SQL Query:** ```sql -- {{ generated_by_signature }} -- Count records that meet specific conditions SELECT COUNT(*) as matching_records, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM read_csv_auto({{ input_table_name }})) as percentage FROM read_csv_auto({{ input_table_name }}) WHERE condition1 = 'value1' AND condition2 > 100; ``` ### Example 6: Cross-Column Analysis **User Question:** "What's the relationship between two columns?" **SQL Query:** ```sql -- {{ generated_by_signature }} -- Calculate relationship between two columns SELECT corr(column2, column1) as correlation, covar_pop(column2, column1) as covariance_population, covar_samp(column2, column1) as covariance_sample, regr_slope(column2, column1) as regression_slope, regr_intercept(column2, column1) as regression_intercept, regr_r2(column2, column1) as regression_r_squared, regr_count(column2, column1) as regression_count FROM read_csv_auto({{ input_table_name }}); ``` ### Example 7: Complex Filtering with Subqueries **User Question:** "Show records above the average" **SQL Query:** ```sql -- {{ generated_by_signature }} -- Find records above the average value SELECT * FROM read_csv_auto({{ input_table_name }}) WHERE numeric_column > ( SELECT AVG(numeric_column) FROM read_csv_auto({{ input_table_name }}) WHERE numeric_column IS NOT NULL ) ORDER BY numeric_column DESC; ``` ### Example 8: Window Functions for Ranking **User Question:** "Rank the data by some criteria" **SQL Query:** ```sql -- {{ generated_by_signature }} -- Rank records using window functions SELECT *, ROW_NUMBER() OVER (ORDER BY numeric_column DESC) as rank, RANK() OVER (ORDER BY numeric_column DESC) as 
rank_with_ties FROM read_csv_auto({{ input_table_name }}) ORDER BY rank; ``` ### Best Practices: - Always use `read_csv_auto({{ input_table_name }})` to read the CSV - Include meaningful column aliases (e.g., `COUNT(*) as total_count`) - Add WHERE clauses to filter out NULL values when appropriate - Use ORDER BY for meaningful result ordering - Include LIMIT for large result sets - Add comments explaining the query logic - Use appropriate aggregation functions (COUNT, SUM, AVG, MIN, MAX) - Consider using CTEs (WITH clauses) for complex queries - Use CASE statements for conditional logic - Leverage window functions for ranking and running totals""" ### Additional Polars SQL Few-Shot Learning Examples when --fewshot-examples is set p_fewshot_examples = """ ## Few-Shot Learning Examples Polars SQL is a SQL dialect patterned after PostgreSQL's syntax, but with Polars-specific functions and operators. Here are examples of common Polars SQL query patterns to follow: ### Example 1: Basic Aggregation **User Question:** "What is the continuous quantile element of a given numerical column?" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Calculate the continuous quantile (30th percentile) of a given numerical column SELECT QUANTILE_CONT(column_name, 0.30) as column_name_q30 FROM {{ input_table_name }}; ``` ### Example 2: Group By with Aggregation **User Question:** "Show me the count by category" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Group records by category and count occurrences SELECT category, COUNT(*) as count FROM {{ input_table_name }} GROUP BY category ORDER BY count DESC; ``` ### Example 3: Date/Time Analysis **User Question:** "What are the trends over time?" 
**Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Analyze trends by date, assuming there's a date column SELECT date_column, COUNT(*) as daily_count, AVG(numeric_column) as avg_value FROM {{ input_table_name }} WHERE date_column IS NOT NULL GROUP BY date_part('day', date_column) ORDER BY date_column; ``` ### Example 4: Top N Analysis **User Question:** "What are the top 10 most frequent values?" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Find top 10 most frequent values in a column SELECT column_name, COUNT(*) as frequency FROM {{ input_table_name }} WHERE column_name IS NOT NULL GROUP BY column_name ORDER BY frequency DESC LIMIT 10; ``` ### Example 5: Conditional Analysis **User Question:** "How many records meet specific criteria?" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Count records that meet specific conditions SELECT COUNT(*) as matching_records, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM {{ input_table_name }}) as percentage FROM {{ input_table_name }} WHERE condition1 = 'value1' AND condition2 > 100; ``` ### Example 6: Cross-Column Analysis **User Question:** "What's the relationship between two columns?" 
**Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Calculate relationship between two columns SELECT corr(column1, column2) as pearson_correlation, covar(column1, column2) as covariance FROM {{ input_table_name }}; ``` ### Example 7: Complex Filtering with Subqueries **User Question:** "Show records above the average" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Find records above the average value SELECT * FROM {{ input_table_name }} WHERE numeric_column > ( SELECT AVG(numeric_column) FROM {{ input_table_name }} WHERE numeric_column IS NOT NULL ) ORDER BY numeric_column DESC; ``` ### Example 8: Window Functions for Ranking **User Question:** "Rank the data by some criteria" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Rank records using window functions SELECT *, ROW_NUMBER() OVER (ORDER BY numeric_column DESC) as rank, RANK() OVER (ORDER BY numeric_column DESC) as rank_with_ties FROM {{ input_table_name }} ORDER BY rank; ``` ### Example 9: String Operations **User Question:** "Extract patterns from text columns" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Extract patterns and analyze string data SELECT text_column, LENGTH(text_column) as text_length, UPPER(text_column) as uppercase_text, LOWER(text_column) as lowercase_text, SUBSTR(text_column, 1, 10) as first_10_chars, TIMESTAMP(text_column, 'yyyy-MM-dd HH:mm:ss') as timestamp_column, DATE(text_column, 'yyyy-MM-dd') as date_column, REGEXP_LIKE(text_column, 'regex_pattern') as is_regex_pattern_present, STRPOS(text_column, 'substring') as substring_position, STARTS_WITH(text_column, 'prefix') as starts_with_prefix, ENDS_WITH(text_column, 'suffix') as ends_with_suffix, REPLACE(text_column, 'old_substring', 'new_substring') as replaced_text, CONCAT(text_column, 'concatenated_text') as concatenated_text, TRIM(text_column) as trimmed_text, LTRIM(text_column) as ltrimmed_text, RTRIM(text_column) as rtrimmed_text FROM {{ input_table_name }} WHERE 
text_column IS NOT NULL; ``` ### Example 10: Array Operations **User Question:** "Work with array columns" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Analyze array column data SELECT array_column, ARRAY_AGG(array_column ORDER BY array_column) as array_agg_sorted, ARRAY_CONTAINS(array_column, 'value') as contains_value, ARRAY_GET(array_column, 1) as array_col_at_1, ARRAY_LENGTH(array_column) as array_length, ARRAY_MAX(array_column) as max_value, ARRAY_MIN(array_column) as min_value FROM {{ input_table_name }} WHERE array_column IS NOT NULL; ``` ### Example 11: Struct Operations **User Question:** "Access nested struct fields" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Extract and analyze nested struct data SELECT struct_column.field1 as extracted_field1, struct_column.field2 as extracted_field2, struct_column.nested.field3 as nested_field3 FROM {{ input_table_name }} WHERE struct_column IS NOT NULL; ``` ### Example 12: Pivot Operations **User Question:** "Create a pivot table" **Polars SQL Query:** ```sql -- {{ generated_by_signature }} -- Create a pivot table from the data SELECT category, SUM(CASE WHEN status = 'active' THEN value ELSE 0 END) as active_sum, SUM(CASE WHEN status = 'inactive' THEN value ELSE 0 END) as inactive_sum, COUNT(CASE WHEN status = 'active' THEN 1 END) as active_count FROM {{ input_table_name }} GROUP BY category; ``` ### Best Practices for Polars SQL: - Always use `{{ input_table_name }}` as the placeholder for the table name - Include meaningful column aliases (e.g., `COUNT(*) as total_count`) - Add WHERE clauses to filter out NULL values when appropriate - Use ORDER BY for meaningful result ordering - Include LIMIT for large result sets - Add comments explaining the query logic - Use appropriate aggregation functions (COUNT, SUM, AVG, MIN, MAX, kurtosis, skewness) - Use CASE statements for conditional logic and pivoting - Leverage window functions for ranking and running totals - Take advantage of 
Polars' optimized array operations - Use `date_part` for time-based grouping instead of `DATE()` function - Consider using CTEs (WITH clauses) for complex queries - Leverage Polars' built-in statistical functions for data analysis"""