Query Parser Example Notebook

# Read queries from a JSON file. Note that this is for demo only.
import json

with open("test/queries.json", "r") as file:
    queries = json.load(file)
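# A quick sanity check on the input; queries is assumed to be a list of SQL
# statement strings.
print(f"Loaded {len(queries)} queries")
print(queries[0])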
# Scan the catalog from a file. Note that this is for demo only.
# Use dbcat (https://github.com/tokern/dbcat) to load the catalog from a database
# or a data warehouse.
from dbcat.scanners.json import File
catalog_objects = File("test", "test/catalog.json").scan()
# Load the catalog. For more details, refer to https://github.com/tokern/dbcat
# Provide the credentials of the Postgres database where the catalog is stored.
# Note that the database and user must already exist.
from data_lineage import catalog_connection
catalog_conf = """
catalog:
  type: postgres
  user: catalog_user
  password: catal0g_passw0rd
  host: 127.0.0.1
  port: 5432
  database: tokern
"""
catalog = catalog_connection(catalog_conf)
catalog.save_catalog(catalog_objects)
from data_lineage.parser import parse_queries, visit_dml_queries
# Parse all queries
parsed = parse_queries(queries)
# Visit the parse trees to extract the source and target tables of the DML queries
visited = visit_dml_queries(catalog, parsed)
# Create a graph and visualize it
from data_lineage.parser import create_graph
graph = create_graph(catalog, visited)
import plotly
plotly.offline.iplot(graph.fig())
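# For sharing outside the notebook, the same figure can also be written to a
# standalone HTML file (a sketch; "lineage.html" is a hypothetical path).
plotly.offline.plot(graph.fig(), filename="lineage.html", auto_open=False)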

# Visualize a sub-graph for a single target column
target = catalog.get_column(database_name="test",
                            schema_name="default",
                            table_name="normalized_pagecounts",
                            column_name="page_title")
sub_graph = graph.sub_graph(target)
sub_fig = sub_graph.fig()
plotly.offline.iplot(sub_fig)
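# The same pattern applies to any other column in the catalog; for example
# (the column name below is hypothetical):
other_target = catalog.get_column(database_name="test",
                                  schema_name="default",
                                  table_name="normalized_pagecounts",
                                  column_name="page_id")
plotly.offline.iplot(graph.sub_graph(other_target).fig())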