Changes from all commits
59 commits
48394f1
Remove usePurviewType experimental flag
wjohnson Dec 5, 2022
b6b48a3
Prioritize resource sets and allow for matching against dfs and blob …
wjohnson Dec 6, 2022
fa5c9c4
Adding recent troubleshooting guidance
wjohnson Dec 12, 2022
9c71e45
Testing Overhaul
wjohnson Dec 12, 2022
73e656e
Resolving race condition based on app settings being deployed while m…
wjohnson Jan 3, 2023
612f4c0
Bump cryptography from 38.0.4 to 39.0.1 in /tests/environment
dependabot[bot] Feb 8, 2023
b74ad2b
Added unit test and integration test for Azure MySQL, and updated LIM…
hmoazam Feb 6, 2023
6f42108
Documentation references OL 0.18 and DBR 11.3
wjohnson Feb 5, 2023
e83d8ac
No longer supporting Spark 2
wjohnson Feb 5, 2023
7099edd
Updates - Postgres (#148)
hmoazam Feb 6, 2023
6296607
Feature/support kusto (#147)
hmoazam Feb 6, 2023
f59a347
Refactoring SelectReturnEntity to be more clear
wjohnson Feb 12, 2023
dbccd6a
Refactoring SelectReturnEntity to reflect we only accept entities wit…
wjohnson Feb 12, 2023
2636d40
Fix spark3-test-def merge conflict
wjohnson Feb 12, 2023
45f65db
Prioritize blob paths over placeholder entity
wjohnson Feb 12, 2023
79410ce
Refactoring validentity to validEntitiesAfterFiltering to make it mor…
wjohnson Feb 12, 2023
61ea232
Removing unncessary column mapping comments and Validate_Resource_Set…
wjohnson Feb 12, 2023
d53f45f
Refactoring PurviewIngestion to remove unused methods and refactor ou…
wjohnson Feb 12, 2023
a947969
Refactor Validate_X_Json method names to reflect what it is testing o…
wjohnson Feb 12, 2023
f061826
Refactoring SendToPurview in PurviewIngestion for each loop's variabl…
wjohnson Feb 12, 2023
d3dd7af
Refactoring PurviewIngestion naming conventions to clarify entities t…
wjohnson Feb 13, 2023
d732ef7
Adding a field for originalQualifiedName and removing unused methods …
wjohnson Feb 13, 2023
48dc0c6
Refactoring the cache naming conventions inside of PurviewIngestion
wjohnson Feb 13, 2023
50b0bf5
ColParser should optionally take in a mapping of original dataset nam…
wjohnson Feb 13, 2023
48af630
Refactoring to support extracting the ColumnParser to be passed aroun…
wjohnson Feb 14, 2023
ada23c9
Implement ColumnParser injection and update column mappings of proces…
wjohnson Feb 14, 2023
0c68b4a
Updating Limitations and Readme to better reflect current state and s…
wjohnson Feb 14, 2023
eddfa25
Handle Azure Data Factory Job Names (#137)
wjohnson Feb 22, 2023
3fd940d
Reflect support for Azure Data Factory (#170)
wjohnson Feb 23, 2023
f4b166b
Update ADF and Kusto limitations (#169)
wjohnson Feb 23, 2023
811a7ae
Update Delta Merge support (#167)
wjohnson Feb 23, 2023
014183b
Adding Hive Table as part of supported features (#171)
wjohnson Feb 24, 2023
fbabc92
OL 13 -> 18 (#173)
hmoazam Feb 24, 2023
5e65747
Added snowflake mapping to gallery (#172)
hmoazam Feb 24, 2023
39dcd36
Updated with new aka.ms url ready for release (#168)
hmoazam Feb 24, 2023
2c58203
fixed MySQL and Postgres test expectations (#174)
hmoazam Feb 25, 2023
b6b1e87
Fix Library Definitions in Job Tasks to prevent deserialization error…
wjohnson Feb 28, 2023
bc40c97
OlToPurviewMapping Quality of Dev Improvements
wjohnson Mar 13, 2023
46ae5b9
Enabling Workflow Dispatch to run the build
wjohnson Mar 13, 2023
05a54b2
Correct one line mappings as artifact
wjohnson Mar 13, 2023
6fd492e
Fix Mappings for Mount Points with Subdirectories
wjohnson Mar 1, 2023
7502ed5
Adding unit tests to confirm mount behavior
wjohnson Mar 13, 2023
c0f65d1
Testin
wjohnson Mar 10, 2023
c9f0189
Rollback Delta Merge statements due to false positive in test suite
wjohnson Mar 13, 2023
51678a0
Remove unncessary comment
wjohnson Mar 13, 2023
6d8df7d
Update readme to rollback delta merge support
wjohnson Mar 13, 2023
c2bf960
Mappings must be a separate artifact
wjohnson Mar 13, 2023
c73398b
Implementing Cosmos support
hmoazam Dec 21, 2022
ba6c4aa
Cosmos integration test added, and test-env README updated. TODO: Upd…
hmoazam Jan 15, 2023
7f24fa2
WIP - found flaws in DataSourceV2 events logic
hmoazam Jan 17, 2023
9f3714c
Cosmos support WIP
hmoazam Jan 18, 2023
b67bbe1
Cosmos WIP
hmoazam Jan 18, 2023
50f3129
Cosmos WIP
hmoazam Jan 18, 2023
00b18d4
Update LIMITATIONS
hmoazam Jan 18, 2023
247e56a
Updated UnitTestData, test CompleteNoOutputsInputsFullMessage expecte…
hmoazam Jan 18, 2023
c3bbfdd
save progress - VS code broken
hmoazam Jan 29, 2023
e008feb
Cosmos WIP
hmoazam Feb 6, 2023
2da877e
Unsuccessful debugging
hmoazam Feb 12, 2023
edd5657
Fix null error when accessing inputs from table storage, and clean up…
hmoazam Feb 17, 2023
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
@@ -29,7 +29,7 @@ If applicable, add screenshots to help explain your problem.
**Desktop (please complete the following information):**
- OS: [e.g. Windows, Mac]
- OpenLineage Version: [e.g. name of jar]
- Databricks Runtime Version: [e.g. 6.4, 9.1, 10.1]
- Databricks Runtime Version: [e.g. 9.1, 10.1, 11.3]
- Cluster Type: [e.g. Job, Interactive]
- Cluster Mode: [e.g. Standard, High Concurrency, Single]
- Using Credential Passthrough: [e.g. Yes, No]
58 changes: 30 additions & 28 deletions .github/workflows/build-release.yml
@@ -9,6 +9,11 @@ on:
- '**.csproj'
- 'tests/integration/**'
workflow_dispatch:
inputs:
tags:
description: 'Flag as workflow dispatch'
required: true
type: boolean

env:
DOTNET_VERSION: '6.x.x' # The .NET SDK version to use
@@ -25,11 +30,13 @@ jobs:
echo "Github Event Name: ${{ github.event_name }}"
echo "Github Ref: ${{ github.ref }}"
echo "Github Ref Type: ${{ github.ref_type }}"
echo "Github Tags: ${{ inputs.tags }}"

build:
if: |
github.event_name == 'pull_request' ||
(github.event_name == 'create' && github.ref_type == 'tag')
(github.event_name == 'create' && github.ref_type == 'tag') ||
${{github.event_name == 'create' && inputs.tags}}
name: build-${{matrix.os}}
runs-on: ${{ matrix.os }}
strategy:
@@ -69,6 +76,18 @@ jobs:
name: FunctionZip
path: ~/artifact/FunctionZip.zip

- name: Create One Line OlToPurviewMappings
run: |
mkdir ~/artifact-mappings
python ./deployment/util/mappings-remove-spaces.py ./deployment/infra/OlToPurviewMappings.json > ~/artifact-mappings/one-line-mappings.json
ls ~/artifact-mappings

- name: Upload One Line OlToPurviewMappings Build Artifact
uses: actions/upload-artifact@v3
with:
name: Mappings
path: ~/artifact-mappings/one-line-mappings.json

runIntegrationTests:
name: Test on Integration Tests
needs: [build]
@@ -85,18 +104,19 @@
name: FunctionZip
path: ./artifacts

- name: Azure Functions Action
- name: Deploy Azure Function to Integration Env
uses: Azure/functions-action@v1.4.6
with:
app-name: ${{ secrets.INT_FUNC_NAME }}
package: ./artifacts/FunctionZip.zip
publish-profile: ${{ secrets.INT_PUBLISH_PROFILE }}

- uses: azure/login@v1
- name: Azure Login
uses: azure/login@v1
with:
creds: ${{ secrets.INT_AZ_CLI_CREDENTIALS }}

- name: Azure CLI script
- name: Compare and Update App Settings on Deployed Function
uses: azure/CLI@v1
with:
azcliversion: 2.34.1
@@ -108,7 +128,7 @@

# Start up Synapse Pool and Execute Tests
- name: Start Integration Synapse SQL Pool
run: source tests/integration/manage-sql-pool.sh start ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }}
run: source tests/integration/manage-sql-pool.sh start ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_SYNAPSE_SQLPOOL_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }}
env:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
@@ -124,6 +144,10 @@
token = ${{ secrets.INT_DATABRICKS_ACCESS_TOKEN }}" > ./config.ini
export DATABRICKS_CONFIG_FILE=./config.ini

- name: Confirm Databricks CLI is configured
run: databricks clusters spark-versions
env:
DATABRICKS_CONFIG_FILE: ./config.ini

- name: Cleanup Integration Environment
run: python ./tests/integration/runner.py --cleanup --dontwait None None None
@@ -144,7 +168,7 @@
DATABRICKS_CONFIG_FILE: ./config.ini

- name: Stop Integration Synapse SQL Pool
run: source tests/integration/manage-sql-pool.sh stop ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }}
run: source tests/integration/manage-sql-pool.sh stop ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_SYNAPSE_SQLPOOL_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }}
env:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
@@ -172,25 +196,3 @@ jobs:
with:
artifacts: ~/artifacts/FunctionZip.zip
token: ${{ secrets.GITHUB_TOKEN }}

deployProductionEnvironment:
name: Release to Production Environment
needs: [createRelease]
runs-on: ubuntu-latest
environment:
name: Production
steps:
- uses: actions/checkout@v3

- name: Download Artifact
uses: actions/download-artifact@v3
with:
name: FunctionZip
path: ./artifacts

- name: Azure Functions Action
uses: Azure/functions-action@v1.4.6
with:
app-name: ${{ secrets.FUNC_NAME }}
package: ./artifacts/FunctionZip.zip
publish-profile: ${{ secrets.PUBLISH_PROFILE }}
1 change: 1 addition & 0 deletions .gitignore
@@ -161,3 +161,4 @@ build

# Ignore local settings
localsettingsdutils.py
*.ini
68 changes: 58 additions & 10 deletions LIMITATIONS.md
@@ -10,7 +10,16 @@ The solution accelerator supports a limited set of data sources to be ingested i
* [Azure Synapse SQL Pools](#azure-synapse-sql-pools)
* [Azure SQL DB](#azure-sql-db)
* [Delta Lake](#delta-lake-file-format)
* [Azure MySQL](#azure-mysql)
* [PostgreSQL](#postgresql)
* [Azure Data Explorer](#azure-data-explorer)
* [Azure Cosmos DB](#azure-cosmos-db)
* [Other Data Sources and Limitations](#other-data-sources-and-limitations)
* [Column Level Mapping Supported Sources](#column-level-mapping-supported-sources)

## Connecting to Assets in Purview

@@ -67,10 +76,43 @@ Supports Azure SQL DB through the [Apache Spark Connector for Azure SQL DB](http

Supports [Delta File Format](https://delta.io/).

* Does NOT support MERGE INTO statement on Databricks due to differences in Databricsk and Open Source classes.
* An earlier release mistakenly indicated support
* Does not support Delta on Spark 2 Databricks Runtimes.
* Does not currently support the MERGE INTO statement due to differences between proprietary Databricks and Open Source Delta implementations.
* Commands such as [Vacuum](https://docs.delta.io/latest/delta-utility.html#toc-entry-1) or [Optimize](https://docs.microsoft.com/en-us/azure/databricks/spark/latest/spark-sql/language-manual/delta-optimize) do not emit any lineage information and will not result in a Purview asset.

## Azure MySQL

Supports Azure MySQL through [JDBC](https://learn.microsoft.com/en-us/azure/databricks/external-data/jdbc).
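
A minimal PySpark sketch of such a read, assuming hypothetical server, table, and credential values (`spark` is the ambient Databricks notebook session):

```python
# Minimal sketch: JDBC read from Azure MySQL with placeholder values.
# The MariaDB JDBC driver shipped with Databricks runtimes is commonly
# used for MySQL connections.
jdbc_url = "jdbc:mysql://<server>.mysql.database.azure.com:3306/<database>"

df = (spark.read
      .format("jdbc")
      .option("url", jdbc_url)
      .option("dbtable", "<table>")
      .option("user", "<user>")
      .option("password", "<password>")
      .option("driver", "org.mariadb.jdbc.Driver")
      .load())
```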

## PostgreSQL

Supports both Azure PostgreSQL and on-prem/VM installations of PostgreSQL through [JDBC](https://learn.microsoft.com/en-us/azure/databricks/external-data/jdbc).

* If you specify the `dbTable` value without a database schema (e.g. `dbo`), the connector assumes the default `public` schema.
* For users and Service Principals with different default schemas, this may result in incorrect lineage.
* This can be corrected by specifying the database schema in the Spark job; the default configuration supports dot-separated, schema-qualified names such as `myschema.mytable` (see the sketch below).
* If you register and scan your PostgreSQL server as `localhost` in Microsoft Purview but use its IP address within the Databricks notebook, the assets will not be matched correctly; use the IP address when registering the server.
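
A minimal sketch of a schema-qualified read, assuming placeholder connection values and the ambient `spark` session:

```python
# Schema-qualifying `dbtable` keeps lineage aligned with the asset
# Purview scanned; omitting the schema implies `public`.
df = (spark.read
      .format("jdbc")
      .option("url", "jdbc:postgresql://<server>.postgres.database.azure.com:5432/<database>")
      .option("dbtable", "myschema.mytable")
      .option("user", "<user>")
      .option("password", "<password>")
      .option("driver", "org.postgresql.Driver")
      .load())
```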

## Azure Data Explorer

Supports Azure Data Explorer (aka Kusto) through the [Azure Data Explorer Connector for Apache Spark](https://learn.microsoft.com/en-us/azure/data-explorer/spark-connector).

* Only supports the `kustoTable` option, as shown in the sketch below.
* If you use the `kustoQuery` option, the solution returns a Purview Generic Connector entity named `COMPLEX` to capture the lineage; we are not able to parse arbitrary Kusto queries at this time.
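
A read sketch using `kustoTable`; cluster, database, and service principal values are placeholders, and `spark` is the ambient session:

```python
# Sketch of a table-based read through the Azure Data Explorer connector.
df = (spark.read
      .format("com.microsoft.kusto.spark.datasource")
      .option("kustoCluster", "https://<cluster>.<region>.kusto.windows.net")
      .option("kustoDatabase", "<database>")
      .option("kustoTable", "<table>")  # parsed for lineage
      # .option("kustoQuery", "...")    # would surface only as a COMPLEX entity
      .option("kustoAadAppId", "<app-id>")
      .option("kustoAadAppSecret", "<app-secret>")
      .option("kustoAadAuthorityID", "<tenant-id>")
      .load())
```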

## Azure Data Factory

Supports capturing lineage for Databricks Notebook activities in Azure Data Factory (ADF). After running a notebook through ADF on an interactive or job cluster, you will see a Databricks Job asset in Microsoft Purview with a name similar to `ADF_<factory name>_<pipeline name>`. For each Databricks notebook activity, you will also see a Databricks Task with a name similar to `ADF_<factory name>_<pipeline name>_<activity name>`.

* At this time, the Microsoft Purview view of Azure Data Factory lineage will not contain these tasks unless the Databricks Task uses or feeds a data source to a Data Flow or Copy activity.
* Copy Activities may not show lineage connecting to these Databricks tasks, since the Copy activity emits individual file assets rather than folder or resource set assets.

## Azure Cosmos DB

Supports querying [Azure Cosmos DB (SQL API)](https://github.com/Azure/azure-sdk-for-java/tree/main/sdk/cosmos/azure-cosmos-spark_3_2-12) through the Azure Cosmos DB Spark 3 connector.
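
A minimal read sketch using the connector's `cosmos.oltp` format; account endpoint, key, database, and container values are placeholders:

```python
# Sketch of a Cosmos DB (SQL API) read via the azure-cosmos-spark connector.
df = (spark.read
      .format("cosmos.oltp")
      .option("spark.cosmos.accountEndpoint", "https://<account>.documents.azure.com:443/")
      .option("spark.cosmos.accountKey", "<account-key>")
      .option("spark.cosmos.database", "<database>")
      .option("spark.cosmos.container", "<container>")
      .load())
```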

## Other Data Sources and Limitations

### Lineage for Unsupported Data Sources
@@ -87,14 +129,6 @@ Microsoft Purview's Fully Qualified Names are case sensitive. Spark Jobs may hav

As a result, this solution attempts to find the best matching *existing* asset. If no existing asset matches based on qualified name, the data source name as found in the Spark query will be used to create a dummy asset. A subsequent scan of the data source in Purview, followed by another run of the Spark query with the connector enabled, will resolve the linkage.

### Column Level Mapping

The solution currently does not provide column level mapping within the Microsoft Purview lineage tab.

### Data Factory

The solution currently reflects the unfriendly job name provided by Data Factory to Databricks as noted in [issue 72](https://github.com/microsoft/Purview-ADB-Lineage-Solution-Accelerator/issues/72#issuecomment-1211202405). You will see jobs with names similar to `ADF_<factory name>_<pipeline name>_<activity name>_<guid>`.

### Hive Metastore / Delta Table Names

The solution currently does not support emitting the Hive Metastore / Delta table SQL names. For example, if you have a Delta table named `default.events` and its physical location is `abfss://container@storage/path`, the solution will report `abfss://container@storage/path`.
@@ -115,8 +149,22 @@ The solution supports Spark 2 job cluster jobs. Databricks has removed Spark 2 f

### Spark 3.3+ Support

The solution supports Spark 3.0, 3.1, and 3.2 interactive and job clusters. We are working with the OpenLineage community to enable support of Spark 3.3 on Databricks Runtime 11.0 and higher.
The solution supports Spark 3.0, 3.1, 3.2, and 3.3 interactive and job clusters. The solution has been tested on the Databricks Runtime 11.3LTS version.

### Private Endpoints on Microsoft Purview

Currently, the solution does not support pushing lineage to a Private Endpoint backed Microsoft Purview service. The solution may be customized to deploy the Azure Function to connect to Microsoft Purview. Consider reviewing the documentation to [Connect privately and securely to your Microsoft Purview account](https://docs.microsoft.com/en-us/azure/purview/catalog-private-link-account-portal).

## Column Level Mapping Supported Sources

Starting with OpenLineage 0.18.0 and release 2.3.0 of the solution accelerator, we support emitting column level mapping from the following sources and their combinations (a sketch follows the list):

* Read / Write to ABFSS file paths (mount or explicit path `abfss://`)
* Read / Write to WASBS file paths (mount or explicit path `wasbs://`)
* Read / Write to the default metastore in Azure Databricks
* Does NOT support custom hive metastores
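
For illustration, an explicit-path read/write that falls within this scope (paths and column names are placeholders; `spark` is the ambient session):

```python
# Illustrative only: an ABFSS-to-ABFSS copy eligible for column mapping.
src = "abfss://<container>@<storage>.dfs.core.windows.net/input/exampleA"
dst = "abfss://<container>@<storage>.dfs.core.windows.net/output/exampleB"

df = spark.read.option("header", "true").csv(src)
df.select("id", "amount").write.mode("overwrite").csv(dst)  # placeholder columns
```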

### Column Mapping Support for Delta Format

* Delta Merge statements are not supported at this time
* Delta to Delta is NOT supported at this time
30 changes: 17 additions & 13 deletions README.md
@@ -48,13 +48,19 @@ Gathering lineage data is performed in the following steps:

* Supports table level lineage from Spark Notebooks and jobs for the following data sources:
* Azure SQL
* Azure Synapse Analytics
* Azure Synapse Analytics (as input)
* Azure Data Lake Gen 2
* Azure Blob Storage
* Delta Lake
* Supports Spark 3.0, 3.1, and 3.2 (Interactive and Job clusters) / Spark 2.x (Job clusters)
* Databricks Runtimes between 6.4 and 10.4 are currently supported
* Can be configured per cluster or for all clusters as a global configuration
* Delta Lake (Merge command not supported)
* Azure Data Explorer
* Azure Data Factory orchestration
* Hive Tables (in default metastore)
* MySQL
* PostgreSQL
* Supports Spark 3.0, 3.1, 3.2, and 3.3 (Interactive and Job clusters) / Spark 2.x (Job clusters)
* Databricks Runtimes between 9.1 and 11.3 LTS are currently supported
* Can be configured per cluster or for all clusters as a global configuration
* Supports **column level lineage** for ABFSS, WASBS, and default metastore hive tables (see [Limitations](./LIMITATIONS.md#column-level-mapping-supported-sources) for more detail)
* Once configured, <span style="color: red;">**does not require any code changes to notebooks or jobs**</span>
* Can [add new source support through configuration](./docs/extending-source-support.md)

@@ -92,26 +98,24 @@ There are two deployment options for this solution accelerator:
1. Once complete, open your Purview workspace and click the "Browse assets" button near the center of the page

1. Click on the "By source type" tab
You should see several items listed under the heading of "Custom source types". There will be a Databricks section and possibly a Purview Custom Connector section under this heading
You should see at least one item listed under the heading of "Azure Databricks". In addition, there may be a Purview Custom Connector section under the Custom source types heading

![browse_assets.png](./assets/img/readme/browse_assets.png)

1. Click on the "Databricks" section, then click on the "Databricks Notebook" tile which corresponds to the notebook you ran. In the Properties or Related tabs select one of the "Notebook Tasks" which represent a task in a Databricks job. From the "Databricks Notebook Task", you may see the lineage of one or many of the different spark actions in the notebook. This application may have a number of "Databricks Processes" linked under it which represent the data lineage. To see these, see the Properties or Related tabs
1. Click on the "Databricks" section, then click on the link to the Azure Databricks workspace which the sample notebook was ran. Then select the notebook which you ran (for those running Databricks Jobs, you can also select the job and drill into the related tasks)
* After running a Databricks Notebook on an Interactive Cluster, you will see lineage directly in the Notebook asset under the Lineage tab.
* After running a Databricks Job on a Job Cluster, you will see lineage in the Notebook Task asset. To navigate from a Notebook to a Notebook Task select the Properties tab and choose the Notebook Tasks from the Related Assets section. Please note that Databricks Jobs lineage require [additional setup](./deploy-base.md#support-extracting-lineage-from-databricks-jobs) outside of the demo deployment.

![databricks_task_related.png](./assets/img/readme/databricks_task_related.png)

1. From the Related view, click on the processes icon, then click on one of the links representing the associated process objects

1. Click on the properties tab to view the properties associated with the process. Note that the full Spark Plan is included

![spark_plan.png](./assets/img/readme/spark_plan.png)

1. Click to the lineage view to see the lineage graph

![lineage_view.png](./assets/img/readme/lineage_view.png)

**Note**: If you are viewing the Databricks Process shortly after it was created, sometimes the lineage tab takes some time to display. If you do not see the lineage tab, wait a few minutes and then refresh the browser.

**Lineage Note**: The screenshot above shows lineage to an Azure Data Lake Gen 2 folder; you must have scanned your Data Lake prior to running a notebook for the solution to match to a Microsoft Purview built-in type like folders or resource sets.

## Troubleshooting

**When filing a new issue, [please include associated log message(s) from Azure Functions](./TROUBLESHOOTING.md#debug-logs).** This will allow the core team to debug within our test environment to validate the issue and develop a solution.
59 changes: 55 additions & 4 deletions TROUBLESHOOTING.md
@@ -48,6 +48,57 @@

In this case, use the databricks CLI to upload the jar to the expected location to avoid changes in the file name.

* ### Internal Error Resolving Secrets

For the demo deployment, if your cluster fails and returns the error "Internal Error resolving secrets" and "Failed to fetch secrets referred to in Spark Conf", the deployment script may have failed to add an Access Policy to the Azure Key Vault or the secret scope was not created.

**Solution**: Update the values in the below script and execute it in the cloud shell. This script deletes the demo deployment's secret scope and then recreates it. After executing the script, you should see an access policy for "AzureDatabricks" in your Azure Key Vault.

```bash
adb_ws_url=adb-DATABRICKS_WORKSPACE.ID.azuredatabricks.net
global_adb_token=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d -o tsv --query '[accessToken]')
subscription_id=SUBSCRIPTION_ID
resource_group=RESOURCE_GROUP_NAME
akv_name=AKV_NAME
adb_ws_id=/subscriptions/$subscription_id/resourceGroups/$resource_group/providers/Microsoft.Databricks/workspaces/DATABRICKS_WORKSPACE_NAME
akv_resource_id=/subscriptions/$subscription_id/resourceGroups/$resource_group/providers/Microsoft.KeyVault/vaults/$akv_name

# Remove the Secret Scope if it exists
cat << EOF > delete-scope.json
{
"scope": "purview-to-adb-kv"
}
EOF

curl \
-X POST https://$adb_ws_url/api/2.0/secrets/scopes/delete \
-H "Authorization: Bearer $global_adb_token" \
-H "X-Databricks-Azure-Workspace-Resource-Id: $adb_ws_id" \
--data @delete-scope.json

# If the above fails, that's okay
# Ultimately, we just need a clean slate

cat << EOF > create-scope.json
{
"scope": "purview-to-adb-kv",
"scope_backend_type": "AZURE_KEYVAULT",
"backend_azure_keyvault":
{
"resource_id": "$akv_resource_id",
"dns_name": "https://$akv_name.vault.azure.net/"
},
"initial_manage_principal": "users"
}
EOF


curl \
-X POST https://$adb_ws_url/api/2.0/secrets/scopes/create \
-H "Authorization: Bearer $global_adb_token" \
-H "X-Databricks-Azure-Workspace-Resource-Id: $adb_ws_id" \
--data @create-scope.json
```

## <a id="no-lineage" />I don't see lineage in Microsoft Purview

* ### Try Refreshing the Page
@@ -89,10 +140,10 @@ When reviewing the Driver logs, you see an error in the Log4j output that indica
* Confirm that `spark.openlineage.version` is set correctly.

|SA Release|OpenLineage Jar|spark.openlineage.version|
|----|----|----|
|1.0.0|0.8.2|1
|1.1.0|0.8.2|1
|2.0.0|0.11.0|v1
|-----|----|----|
|1.0.x|0.8.2|1|
|1.1.x|0.8.2|1|
|2.x.x or newer|0.11.0 or newer|v1|
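
To confirm what the listener will actually see, you can read the values back from a notebook on the configured cluster. A sketch, assuming the key names above and the ambient `spark` session (unset keys raise an exception):

```python
# Read back the OpenLineage settings visible to this cluster.
for key in ("spark.openlineage.version",
            "spark.openlineage.host",
            "spark.openlineage.namespace"):
    try:
        print(key, "=", spark.conf.get(key))
    except Exception:
        print(key, "is not set")
```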

## <a id="pureviewout-load2purview" />PurviewOut Logs: Error Loading to Purview: 403 Forbidden

Binary file modified assets/img/readme/browse_assets.png
Binary file modified assets/img/readme/databricks_task_related.png
Binary file modified assets/img/readme/lineage.png
Binary file modified assets/img/readme/lineage_view.png