From e41942e2736d3562e5286613ebb0bcadb94ac3ef Mon Sep 17 00:00:00 2001 From: Dhruv Pratap Date: Mon, 26 Jun 2023 18:32:07 -0400 Subject: [PATCH 1/5] PySparkler: Sqlfluff upgrades a de-templated SQL string. --- .gitignore | 2 + pysparkler/poetry.lock | 337 +++++++++++++++++++++++++- pysparkler/pyproject.toml | 6 + pysparkler/pysparkler/sql_21_to_33.py | 91 +++++++ pysparkler/tests/test_sql_21_to_33.py | 39 +++ 5 files changed, 468 insertions(+), 7 deletions(-) create mode 100644 pysparkler/pysparkler/sql_21_to_33.py create mode 100644 pysparkler/tests/test_sql_21_to_33.py diff --git a/.gitignore b/.gitignore index 631512d..8699be7 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,8 @@ pipelinecompare/metastore_db/ **/dist/* **/__pycache__/* .DS_Store +**/build/* +**/*.egg-info/* # Annoying macOS Ignores **.DS_Store diff --git a/pysparkler/poetry.lock b/pysparkler/poetry.lock index e9eb9a0..d85e184 100644 --- a/pysparkler/poetry.lock +++ b/pysparkler/poetry.lock @@ -1,5 +1,17 @@ # This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +[[package]] +name = "appdirs" +version = "1.4.4" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, + {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, +] + [[package]] name = "attrs" version = "23.1.0" @@ -31,6 +43,18 @@ files = [ {file = "cfgv-3.3.1.tar.gz", hash = "sha256:f5a830efb9ce7a445376bb66ec94c638a9787422f96264c98edc6bdeed8ab736"}, ] +[[package]] +name = "chardet" +version = "5.1.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"}, + {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"}, +] + [[package]] name = "click" version = "8.1.3" @@ -58,6 +82,27 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "diff-cover" +version = "7.6.0" +description = "Run coverage and linting reports on diffs" +category = "main" +optional = false +python-versions = ">=3.7.2,<4.0.0" +files = [ + {file = "diff_cover-7.6.0-py3-none-any.whl", hash = "sha256:e6e44554d6adb053f4cb292222e4cb772ef7d5a5690cdb51fe806f51f45dd690"}, + {file = "diff_cover-7.6.0.tar.gz", hash = "sha256:d1007303f6bfba14925f602fc735ca8d6b254d8b57320f2b4b38a3a618453f40"}, +] + +[package.dependencies] +chardet = ">=3.0.0" +Jinja2 = ">=2.7.1" +pluggy = ">=0.13.1,<2" +Pygments = ">=2.9.0,<3.0.0" + +[package.extras] +toml = ["tomli (>=1.2.1)"] + [[package]] name = "distlib" version = "0.3.6" @@ -74,7 +119,7 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -135,7 +180,7 @@ license = ["ukkonen"] name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -143,6 +188,24 @@ files = [ {file = 
"iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "jinja2" +version = "3.1.2" +description = "A very fast and expressive template engine." +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, + {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + [[package]] name = "jsonschema" version = "4.17.3" @@ -257,6 +320,66 @@ profiling = ["gprof2dot"] rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] +[[package]] +name = "markupsafe" +version = "2.1.3" +description = "Safely add untrusted strings to HTML/XML markup." +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-win32.whl", hash = "sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-win32.whl", hash = "sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-win_amd64.whl", hash = "sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-win32.whl", hash = 
"sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-win_amd64.whl", hash = "sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-win32.whl", hash = "sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba"}, + {file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"}, +] + [[package]] name = "mdurl" version = "0.1.2" @@ -322,7 +445,7 @@ setuptools = "*" name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -330,6 +453,18 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pathspec" +version = "0.11.1" +description = "Utility library for gitignore style pattern matching of file paths." 
+category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, + {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, +] + [[package]] name = "platformdirs" version = "3.5.3" @@ -350,7 +485,7 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -437,7 +572,7 @@ files = [ name = "pytest" version = "7.3.2" description = "pytest: simple powerful testing with Python" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -530,6 +665,104 @@ files = [ {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, ] +[[package]] +name = "regex" +version = "2023.6.3" +description = "Alternative regular expression module, to replace re." +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "regex-2023.6.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:824bf3ac11001849aec3fa1d69abcb67aac3e150a933963fb12bda5151fe1bfd"}, + {file = "regex-2023.6.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:05ed27acdf4465c95826962528f9e8d41dbf9b1aa8531a387dee6ed215a3e9ef"}, + {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b49c764f88a79160fa64f9a7b425620e87c9f46095ef9c9920542ab2495c8bc"}, + {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8e3f1316c2293e5469f8f09dc2d76efb6c3982d3da91ba95061a7e69489a14ef"}, + {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43e1dd9d12df9004246bacb79a0e5886b3b6071b32e41f83b0acbf293f820ee8"}, + {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4959e8bcbfda5146477d21c3a8ad81b185cd252f3d0d6e4724a5ef11c012fb06"}, + {file = "regex-2023.6.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:af4dd387354dc83a3bff67127a124c21116feb0d2ef536805c454721c5d7993d"}, + {file = "regex-2023.6.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2239d95d8e243658b8dbb36b12bd10c33ad6e6933a54d36ff053713f129aa536"}, + {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:890e5a11c97cf0d0c550eb661b937a1e45431ffa79803b942a057c4fb12a2da2"}, + {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a8105e9af3b029f243ab11ad47c19b566482c150c754e4c717900a798806b222"}, + {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:25be746a8ec7bc7b082783216de8e9473803706723b3f6bef34b3d0ed03d57e2"}, + {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:3676f1dd082be28b1266c93f618ee07741b704ab7b68501a173ce7d8d0d0ca18"}, + {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:10cb847aeb1728412c666ab2e2000ba6f174f25b2bdc7292e7dd71b16db07568"}, + {file = "regex-2023.6.3-cp310-cp310-win32.whl", hash = "sha256:dbbbfce33cd98f97f6bffb17801b0576e653f4fdb1d399b2ea89638bc8d08ae1"}, + {file = "regex-2023.6.3-cp310-cp310-win_amd64.whl", hash = 
"sha256:c5f8037000eb21e4823aa485149f2299eb589f8d1fe4b448036d230c3f4e68e0"}, + {file = "regex-2023.6.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c123f662be8ec5ab4ea72ea300359023a5d1df095b7ead76fedcd8babbedf969"}, + {file = "regex-2023.6.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9edcbad1f8a407e450fbac88d89e04e0b99a08473f666a3f3de0fd292badb6aa"}, + {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcba6dae7de533c876255317c11f3abe4907ba7d9aa15d13e3d9710d4315ec0e"}, + {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29cdd471ebf9e0f2fb3cac165efedc3c58db841d83a518b082077e612d3ee5df"}, + {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12b74fbbf6cbbf9dbce20eb9b5879469e97aeeaa874145517563cca4029db65c"}, + {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c29ca1bd61b16b67be247be87390ef1d1ef702800f91fbd1991f5c4421ebae8"}, + {file = "regex-2023.6.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d77f09bc4b55d4bf7cc5eba785d87001d6757b7c9eec237fe2af57aba1a071d9"}, + {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ea353ecb6ab5f7e7d2f4372b1e779796ebd7b37352d290096978fea83c4dba0c"}, + {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:10590510780b7541969287512d1b43f19f965c2ece6c9b1c00fc367b29d8dce7"}, + {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e2fbd6236aae3b7f9d514312cdb58e6494ee1c76a9948adde6eba33eb1c4264f"}, + {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:6b2675068c8b56f6bfd5a2bda55b8accbb96c02fd563704732fd1c95e2083461"}, + {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74419d2b50ecb98360cfaa2974da8689cb3b45b9deff0dcf489c0d333bcc1477"}, + {file = "regex-2023.6.3-cp311-cp311-win32.whl", hash = "sha256:fb5ec16523dc573a4b277663a2b5a364e2099902d3944c9419a40ebd56a118f9"}, + {file = "regex-2023.6.3-cp311-cp311-win_amd64.whl", hash = "sha256:09e4a1a6acc39294a36b7338819b10baceb227f7f7dbbea0506d419b5a1dd8af"}, + {file = "regex-2023.6.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0654bca0cdf28a5956c83839162692725159f4cda8d63e0911a2c0dc76166525"}, + {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:463b6a3ceb5ca952e66550a4532cef94c9a0c80dc156c4cc343041951aec1697"}, + {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87b2a5bb5e78ee0ad1de71c664d6eb536dc3947a46a69182a90f4410f5e3f7dd"}, + {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6343c6928282c1f6a9db41f5fd551662310e8774c0e5ebccb767002fcf663ca9"}, + {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6192d5af2ccd2a38877bfef086d35e6659566a335b1492786ff254c168b1693"}, + {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74390d18c75054947e4194019077e243c06fbb62e541d8817a0fa822ea310c14"}, + {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:742e19a90d9bb2f4a6cf2862b8b06dea5e09b96c9f2df1779e53432d7275331f"}, + {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_aarch64.whl", 
hash = "sha256:8abbc5d54ea0ee80e37fef009e3cec5dafd722ed3c829126253d3e22f3846f1e"}, + {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:c2b867c17a7a7ae44c43ebbeb1b5ff406b3e8d5b3e14662683e5e66e6cc868d3"}, + {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:d831c2f8ff278179705ca59f7e8524069c1a989e716a1874d6d1aab6119d91d1"}, + {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:ee2d1a9a253b1729bb2de27d41f696ae893507c7db224436abe83ee25356f5c1"}, + {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:61474f0b41fe1a80e8dfa70f70ea1e047387b7cd01c85ec88fa44f5d7561d787"}, + {file = "regex-2023.6.3-cp36-cp36m-win32.whl", hash = "sha256:0b71e63226e393b534105fcbdd8740410dc6b0854c2bfa39bbda6b0d40e59a54"}, + {file = "regex-2023.6.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bbb02fd4462f37060122e5acacec78e49c0fbb303c30dd49c7f493cf21fc5b27"}, + {file = "regex-2023.6.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b862c2b9d5ae38a68b92e215b93f98d4c5e9454fa36aae4450f61dd33ff48487"}, + {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:976d7a304b59ede34ca2921305b57356694f9e6879db323fd90a80f865d355a3"}, + {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:83320a09188e0e6c39088355d423aa9d056ad57a0b6c6381b300ec1a04ec3d16"}, + {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9427a399501818a7564f8c90eced1e9e20709ece36be701f394ada99890ea4b3"}, + {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7178bbc1b2ec40eaca599d13c092079bf529679bf0371c602edaa555e10b41c3"}, + {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:837328d14cde912af625d5f303ec29f7e28cdab588674897baafaf505341f2fc"}, + {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d44dc13229905ae96dd2ae2dd7cebf824ee92bc52e8cf03dcead37d926da019"}, + {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d54af539295392611e7efbe94e827311eb8b29668e2b3f4cadcfe6f46df9c777"}, + {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:7117d10690c38a622e54c432dfbbd3cbd92f09401d622902c32f6d377e2300ee"}, + {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bb60b503ec8a6e4e3e03a681072fa3a5adcbfa5479fa2d898ae2b4a8e24c4591"}, + {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:65ba8603753cec91c71de423a943ba506363b0e5c3fdb913ef8f9caa14b2c7e0"}, + {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:271f0bdba3c70b58e6f500b205d10a36fb4b58bd06ac61381b68de66442efddb"}, + {file = "regex-2023.6.3-cp37-cp37m-win32.whl", hash = "sha256:9beb322958aaca059f34975b0df135181f2e5d7a13b84d3e0e45434749cb20f7"}, + {file = "regex-2023.6.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fea75c3710d4f31389eed3c02f62d0b66a9da282521075061ce875eb5300cf23"}, + {file = "regex-2023.6.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8f56fcb7ff7bf7404becdfc60b1e81a6d0561807051fd2f1860b0d0348156a07"}, + {file = "regex-2023.6.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d2da3abc88711bce7557412310dfa50327d5769a31d1c894b58eb256459dc289"}, + {file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a99b50300df5add73d307cf66abea093304a07eb017bce94f01e795090dea87c"}, + {file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5708089ed5b40a7b2dc561e0c8baa9535b77771b64a8330b684823cfd5116036"}, + {file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:687ea9d78a4b1cf82f8479cab23678aff723108df3edeac098e5b2498879f4a7"}, + {file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d3850beab9f527f06ccc94b446c864059c57651b3f911fddb8d9d3ec1d1b25d"}, + {file = "regex-2023.6.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8915cc96abeb8983cea1df3c939e3c6e1ac778340c17732eb63bb96247b91d2"}, + {file = "regex-2023.6.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:841d6e0e5663d4c7b4c8099c9997be748677d46cbf43f9f471150e560791f7ff"}, + {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9edce5281f965cf135e19840f4d93d55b3835122aa76ccacfd389e880ba4cf82"}, + {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b956231ebdc45f5b7a2e1f90f66a12be9610ce775fe1b1d50414aac1e9206c06"}, + {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:36efeba71c6539d23c4643be88295ce8c82c88bbd7c65e8a24081d2ca123da3f"}, + {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:cf67ca618b4fd34aee78740bea954d7c69fdda419eb208c2c0c7060bb822d747"}, + {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b4598b1897837067a57b08147a68ac026c1e73b31ef6e36deeeb1fa60b2933c9"}, + {file = "regex-2023.6.3-cp38-cp38-win32.whl", hash = "sha256:f415f802fbcafed5dcc694c13b1292f07fe0befdb94aa8a52905bd115ff41e88"}, + {file = "regex-2023.6.3-cp38-cp38-win_amd64.whl", hash = "sha256:d4f03bb71d482f979bda92e1427f3ec9b220e62a7dd337af0aa6b47bf4498f72"}, + {file = "regex-2023.6.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ccf91346b7bd20c790310c4147eee6ed495a54ddb6737162a36ce9dbef3e4751"}, + {file = "regex-2023.6.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b28f5024a3a041009eb4c333863d7894d191215b39576535c6734cd88b0fcb68"}, + {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0bb18053dfcfed432cc3ac632b5e5e5c5b7e55fb3f8090e867bfd9b054dbcbf"}, + {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a5bfb3004f2144a084a16ce19ca56b8ac46e6fd0651f54269fc9e230edb5e4a"}, + {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c6b48d0fa50d8f4df3daf451be7f9689c2bde1a52b1225c5926e3f54b6a9ed1"}, + {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:051da80e6eeb6e239e394ae60704d2b566aa6a7aed6f2890a7967307267a5dc6"}, + {file = "regex-2023.6.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4c3b7fa4cdaa69268748665a1a6ff70c014d39bb69c50fda64b396c9116cf77"}, + {file = "regex-2023.6.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:457b6cce21bee41ac292d6753d5e94dcbc5c9e3e3a834da285b0bde7aa4a11e9"}, + {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:aad51907d74fc183033ad796dd4c2e080d1adcc4fd3c0fd4fd499f30c03011cd"}, + {file = 
"regex-2023.6.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:0385e73da22363778ef2324950e08b689abdf0b108a7d8decb403ad7f5191938"}, + {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c6a57b742133830eec44d9b2290daf5cbe0a2f1d6acee1b3c7b1c7b2f3606df7"}, + {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:3e5219bf9e75993d73ab3d25985c857c77e614525fac9ae02b1bebd92f7cecac"}, + {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e5087a3c59eef624a4591ef9eaa6e9a8d8a94c779dade95d27c0bc24650261cd"}, + {file = "regex-2023.6.3-cp39-cp39-win32.whl", hash = "sha256:20326216cc2afe69b6e98528160b225d72f85ab080cbdf0b11528cbbaba2248f"}, + {file = "regex-2023.6.3-cp39-cp39-win_amd64.whl", hash = "sha256:bdff5eab10e59cf26bc479f565e25ed71a7d041d1ded04ccf9aee1d9f208487a"}, + {file = "regex-2023.6.3.tar.gz", hash = "sha256:72d1a25bf36d2050ceb35b517afe13864865268dfb45910e2e17a84be6cbfeb0"}, +] + [[package]] name = "rich" version = "13.4.2" @@ -566,11 +799,80 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-g testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "sqlfluff" +version = "2.1.1" +description = "The SQL Linter for Humans" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sqlfluff-2.1.1-py3-none-any.whl", hash = "sha256:7f303189ba3d1e50df5782c0bea974de8a25db0e6d1f0d1238f75b848c8d3f27"}, + {file = "sqlfluff-2.1.1.tar.gz", hash = "sha256:974a2eecb91ab3236b73097ea6c7532a96dd0dcafd4394fa2305b2845ef4d7d2"}, +] + +[package.dependencies] +appdirs = "*" +chardet = "*" +click = "*" +colorama = ">=0.3" +diff-cover = ">=2.5.0" +Jinja2 = "*" +pathspec = "*" +pytest = "*" +pyyaml = ">=5.1" +regex = "*" +tblib = "*" +toml = {version = "*", markers = "python_version < \"3.11\""} +tqdm = "*" +typing-extensions = "*" + +[[package]] +name = "sqlfluff-plugin-sparksql-upgrade" +version = "0.0.0" +description = "SQLFluff rules to help migrate your Spark SQL from 2.X to 3.X" +category = "main" +optional = false +python-versions = "*" +files = [] +develop = false + +[package.dependencies] +sqlfluff = ">=1.0.0" + +[package.source] +type = "directory" +url = "../sql" + +[[package]] +name = "tblib" +version = "2.0.0" +description = "Traceback serialization library." 
+category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tblib-2.0.0-py3-none-any.whl", hash = "sha256:9100bfa016b047d5b980d66e7efed952fbd20bd85b56110aaf473cb97d18709a"}, + {file = "tblib-2.0.0.tar.gz", hash = "sha256:a6df30f272c08bf8be66e0775fad862005d950a6b8449b94f7c788731d70ecd7"}, +] + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] + [[package]] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -578,6 +880,27 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "traitlets" version = "5.9.0" @@ -646,4 +969,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "coverage-enable-subprocess [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "0540b689361a3b1b1144ecb7b2f236de0196bf6414f78e95b1e2e5d87ec788d3" +content-hash = "ac647139b2d9874aac05344b01c1eb96ca9b6f23dc3ff9acac73a588b5a831d4" diff --git a/pysparkler/pyproject.toml b/pysparkler/pyproject.toml index 261a9e3..297e94e 100644 --- a/pysparkler/pyproject.toml +++ b/pysparkler/pyproject.toml @@ -18,6 +18,8 @@ libcst = "^1.0.1" click = "^8.1.3" rich = "^13.3.3" nbformat = "^5.8.0" +sqlfluff = "^2.1.1" +sqlfluff-plugin-sparksql-upgrade = {path = "../sql"} [tool.poetry.group.test.dependencies] @@ -57,3 +59,7 @@ ignore_missing_imports = true [[tool.mypy.overrides]] module = "nbformat.*" ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "sqlfluff.*" +ignore_missing_imports = true diff --git a/pysparkler/pysparkler/sql_21_to_33.py b/pysparkler/pysparkler/sql_21_to_33.py new file mode 100644 index 0000000..333d7e7 --- /dev/null +++ b/pysparkler/pysparkler/sql_21_to_33.py @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# # +# http://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import libcst as cst +import libcst.matchers as m +import sqlfluff + +from pysparkler.base import StatementLineCommentWriter + +# Migration rules for SQL statements used in PySpark from 2.x to 3.3 +# https://spark.apache.org/docs/latest/sql-migration-guide.html + + +SPARK_SQL_DIALECT = "sparksql" +SPARK_SQL_CAST_RULE = "SPARKSQLCAST_L001" + + +class SqlStatementUpgradeAndCommentWriter(StatementLineCommentWriter): + """This migration rule uses the Sqlfluff plugin written in the adjacent SQL module and makes the best effort to + upcast SQL statements directly being executed from within a PySpark script. However, the upgrade won't be possible + for certain templated SQLs, and in those scenarios this tool will leave code hints for users as DIY instructions. + """ + + def __init__( + self, + ): + super().__init__( + transformer_id="PY21-33-001", + comment="Please note, PySparkler makes a best effort to upcast SQL statements directly being executed. \ +However, the upgrade won't be possible for certain templated SQLs, and in those scenarios please de-template the SQL \ +and use the Sqlfluff tooling to upcast the SQL yourself.", + ) + + def leave_Call(self, original_node: cst.Call, updated_node: cst.Call) -> cst.Call: + """Check if the call is a SQL statement and try to upcast it""" + print(f"******** Call node\n{original_node}") + if m.matches( + updated_node, + m.Call( + func=m.Attribute( + attr=m.Name("sql"), + ), + args=[ + m.Arg( + value=m.SimpleString(), + ) + ], + ), + ): + print(f"******** Match found\n{original_node}") + self.match_found = True + sql_node: cst.SimpleString = updated_node.args[0].value + sql = sql_node.evaluated_value + try: + updated_sql = self.do_fix(sql) + if updated_sql != sql: + updated_sql_value = ( + sql_node.prefix + sql_node.quote + updated_sql + sql_node.quote + ) + changes = updated_node.with_changes( + args=[cst.Arg(value=cst.SimpleString(value=updated_sql_value))] + ) + return changes + except Exception as e: # pylint: disable=broad-except + print(f"Failed to parse SQL: {sql} with error: {e}") + + return updated_node + + @staticmethod + def do_fix(sql: str) -> str: + return sqlfluff.fix( + sql, + dialect=SPARK_SQL_DIALECT, + rules=[SPARK_SQL_CAST_RULE], + fix_even_unparsable=True, + ) diff --git a/pysparkler/tests/test_sql_21_to_33.py b/pysparkler/tests/test_sql_21_to_33.py new file mode 100644 index 0000000..5ba1eee --- /dev/null +++ b/pysparkler/tests/test_sql_21_to_33.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# # +# http://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +from pysparkler.sql_21_to_33 import SqlStatementUpgradeAndCommentWriter +from tests.conftest import rewrite + + +def test_upgrades_non_templated_sql(): + given_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +result = spark.sql("select cast(dateint as int) val from my_table limit 10") +spark.stop() +""" + modified_code = rewrite(given_code, SqlStatementUpgradeAndCommentWriter()) + expected_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +result = spark.sql("select int(dateint) val from my_table limit 10") # PY21-33-001: Please note, PySparkler makes a best effort to upcast SQL statements directly being executed. However, the upgrade won't be possible for certain templated SQLs, and in those scenarios please de-template the SQL and use the Sqlfluff tooling to upcast the SQL yourself. # noqa: E501 +spark.stop() +""" + assert modified_code == expected_code From 395d70b7433fbda93e27013851287d085acd41cd Mon Sep 17 00:00:00 2001 From: Dhruv Pratap Date: Tue, 27 Jun 2023 13:00:45 -0400 Subject: [PATCH 2/5] PySparkler: Sqlfluff upgrades a formatted string SQL and provides code hint for ones with complex expressions within. --- pysparkler/pysparkler/sql_21_to_33.py | 91 ++++++++++++++++++++++----- pysparkler/tests/test_sql_21_to_33.py | 65 ++++++++++++++++++- 2 files changed, 139 insertions(+), 17 deletions(-) diff --git a/pysparkler/pysparkler/sql_21_to_33.py b/pysparkler/pysparkler/sql_21_to_33.py index 333d7e7..f182225 100644 --- a/pysparkler/pysparkler/sql_21_to_33.py +++ b/pysparkler/pysparkler/sql_21_to_33.py @@ -42,45 +42,104 @@ def __init__( super().__init__( transformer_id="PY21-33-001", comment="Please note, PySparkler makes a best effort to upcast SQL statements directly being executed. 
\ -However, the upgrade won't be possible for certain templated SQLs, and in those scenarios please de-template the SQL \ -and use the Sqlfluff tooling to upcast the SQL yourself.", +However, the upcast won't be possible for certain formatted string SQL having complex expressions within, and in those \ +cases please de-template the SQL and use the Sqlfluff tooling to upcast the SQL yourself.", ) + self.sql_upgraded = False def leave_Call(self, original_node: cst.Call, updated_node: cst.Call) -> cst.Call: """Check if the call is a SQL statement and try to upcast it""" - print(f"******** Call node\n{original_node}") if m.matches( - updated_node, + original_node, m.Call( func=m.Attribute( attr=m.Name("sql"), ), args=[ m.Arg( - value=m.SimpleString(), + value=m.OneOf( + m.SimpleString(), + m.FormattedString(), + m.ConcatenatedString(), + ) ) ], ), ): - print(f"******** Match found\n{original_node}") self.match_found = True - sql_node: cst.SimpleString = updated_node.args[0].value - sql = sql_node.evaluated_value + sql_node: cst.BaseExpression = updated_node.args[0].value try: - updated_sql = self.do_fix(sql) - if updated_sql != sql: - updated_sql_value = ( - sql_node.prefix + sql_node.quote + updated_sql + sql_node.quote + if isinstance(sql_node, cst.SimpleString): + updated_sql_node = self.update_simple_string_sql(sql_node) + elif isinstance(sql_node, cst.FormattedString): + updated_sql_node = self.update_formatted_string_sql(sql_node) + else: + raise NotImplementedError( + f"Unsupported SQL expression encountered : {sql_node}" ) - changes = updated_node.with_changes( - args=[cst.Arg(value=cst.SimpleString(value=updated_sql_value))] + + if self.sql_upgraded: + self.comment = "Spark SQL statement has been upgraded to Spark 3.3 compatible syntax." + self.sql_upgraded = False + else: + self.comment = ( + "Spark SQL statement has Spark 3.3 compatible syntax." ) - return changes + + return updated_node.with_changes(args=[cst.Arg(value=updated_sql_node)]) except Exception as e: # pylint: disable=broad-except - print(f"Failed to parse SQL: {sql} with error: {e}") + print(f"Failed to parse SQL: {sql_node} with error: {e}") + self.comment = "Unable to inspect the Spark SQL statement since the formatted string SQL has complex \ +expressions within. Please de-template the SQL and use the Sqlfluff tooling to upcast the SQL yourself." 
+ self.sql_upgraded = False return updated_node + def update_simple_string_sql(self, sql_node: cst.SimpleString) -> cst.SimpleString: + sql = sql_node.evaluated_value + updated_sql = self.do_fix(sql) + if updated_sql != sql: + self.sql_upgraded = True + updated_sql_value = ( + sql_node.prefix + sql_node.quote + updated_sql + sql_node.quote + ) + return cst.SimpleString(value=updated_sql_value) + else: + return sql_node + + def update_formatted_string_sql( + self, sql_node: cst.FormattedString + ) -> cst.FormattedString: + # Form the raw SQL string by concatenating all the parts + sql = "" + for part in sql_node.parts: + if isinstance(part, cst.FormattedStringText): + sql += part.value + elif isinstance(part, cst.FormattedStringExpression) and isinstance( + part.expression, cst.Name + ): + sql += ( + part.whitespace_before_expression.value + + "{" + + part.expression.value + + "}" + + part.whitespace_after_expression.value + ) + else: + raise NotImplementedError( + f"Unsupported formatted string expression encountered : {part}" + ) + + updated_sql = self.do_fix(sql) + if updated_sql != sql: + self.sql_upgraded = True + updated_sql_value = ( + sql_node.prefix + sql_node.quote + updated_sql + sql_node.quote + ) + return cst.parse_expression(updated_sql_value) + else: + return sql_node + @staticmethod def do_fix(sql: str) -> str: return sqlfluff.fix( diff --git a/pysparkler/tests/test_sql_21_to_33.py b/pysparkler/tests/test_sql_21_to_33.py index 5ba1eee..c5f31a6 100644 --- a/pysparkler/tests/test_sql_21_to_33.py +++ b/pysparkler/tests/test_sql_21_to_33.py @@ -33,7 +33,70 @@ def test_upgrades_non_templated_sql(): from pyspark.sql import SparkSession spark = SparkSession.builder.appName("SQL Example").getOrCreate() -result = spark.sql("select int(dateint) val from my_table limit 10") # PY21-33-001: Please note, PySparkler makes a best effort to upcast SQL statements directly being executed. However, the upgrade won't be possible for certain templated SQLs, and in those scenarios please de-template the SQL and use the Sqlfluff tooling to upcast the SQL yourself. # noqa: E501 +result = spark.sql("select int(dateint) val from my_table limit 10") # PY21-33-001: Spark SQL statement has been upgraded to Spark 3.3 compatible syntax. # noqa: E501 +spark.stop() +""" + assert modified_code == expected_code + + +def test_upgrades_templated_sql(): + given_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +table_name = "my_table" +result = spark.sql(f"select cast(dateint as int) val from {table_name} limit 10") +spark.stop() +""" + modified_code = rewrite(given_code, SqlStatementUpgradeAndCommentWriter()) + expected_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +table_name = "my_table" +result = spark.sql(f"select int(dateint) val from {table_name} limit 10") # PY21-33-001: Spark SQL statement has been upgraded to Spark 3.3 compatible syntax. 
# noqa: E501 +spark.stop() +""" + assert modified_code == expected_code + + +def test_unable_to_upgrade_templated_sql_with_complex_expressions(): + given_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +table_name = "my_table" +num = 10 +result = spark.sql(f"select cast(dateint as int) val from {table_name} where x < {num * 100} limit 10") +spark.stop() +""" + modified_code = rewrite(given_code, SqlStatementUpgradeAndCommentWriter()) + expected_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +table_name = "my_table" +num = 10 +result = spark.sql(f"select cast(dateint as int) val from {table_name} where x < {num * 100} limit 10") # PY21-33-001: Unable to inspect the Spark SQL statement since the formatted string SQL has complex expressions within. Please de-template the SQL and use the Sqlfluff tooling to upcast the SQL yourself. # noqa: E501 +spark.stop() +""" + assert modified_code == expected_code + + +def test_no_upgrades_required_after_inspecting_sql(): + given_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +result = spark.sql("select * from my_table limit 10") +spark.stop() +""" + modified_code = rewrite(given_code, SqlStatementUpgradeAndCommentWriter()) + expected_code = """\ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("SQL Example").getOrCreate() +result = spark.sql("select * from my_table limit 10") # PY21-33-001: Spark SQL statement has Spark 3.3 compatible syntax. # noqa: E501 spark.stop() """ assert modified_code == expected_code From c6bb99af19d04fa9ef01ed74481179ccb0301fbc Mon Sep 17 00:00:00 2001 From: Dhruv Pratap Date: Tue, 27 Jun 2023 14:00:43 -0400 Subject: [PATCH 3/5] PySparkler: Add upgrade-sql option via CLI to enable end-users to upgrade their de-templated SQLs themselves. --- pysparkler/poetry.lock | 68 +++++++++++++-------------- pysparkler/pyproject.toml | 2 +- pysparkler/pysparkler/cli.py | 26 ++++++++++ pysparkler/pysparkler/sql_21_to_33.py | 6 +-- pysparkler/tests/test_sql_21_to_33.py | 2 +- 5 files changed, 65 insertions(+), 39 deletions(-) diff --git a/pysparkler/poetry.lock b/pysparkler/poetry.lock index d85e184..343dc46 100644 --- a/pysparkler/poetry.lock +++ b/pysparkler/poetry.lock @@ -147,14 +147,14 @@ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benc [[package]] name = "filelock" -version = "3.12.1" +version = "3.12.2" description = "A platform independent file lock." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "filelock-3.12.1-py3-none-any.whl", hash = "sha256:42f1e4ff2b497311213d61ad7aac5fed9050608e5309573f101eefa94143134a"}, - {file = "filelock-3.12.1.tar.gz", hash = "sha256:82b1f7da46f0ae42abf1bc78e548667f484ac59d2bcec38c713cee7e2eb51e83"}, + {file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"}, + {file = "filelock-3.12.2.tar.gz", hash = "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81"}, ] [package.extras] @@ -228,14 +228,14 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- [[package]] name = "jupyter-core" -version = "5.3.0" +version = "5.3.1" description = "Jupyter core package. A base package on which Jupyter projects rely." 
category = "main" optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_core-5.3.0-py3-none-any.whl", hash = "sha256:d4201af84559bc8c70cead287e1ab94aeef3c512848dde077b7684b54d67730d"}, - {file = "jupyter_core-5.3.0.tar.gz", hash = "sha256:6db75be0c83edbf1b7c9f91ec266a9a24ef945da630f3120e1a0046dc13713fc"}, + {file = "jupyter_core-5.3.1-py3-none-any.whl", hash = "sha256:ae9036db959a71ec1cac33081eeb040a79e681f08ab68b0883e9a676c7a90dce"}, + {file = "jupyter_core-5.3.1.tar.gz", hash = "sha256:5ba5c7938a7f97a6b0481463f7ff0dbac7c15ba48cf46fa4035ca6e838aa1aba"}, ] [package.dependencies] @@ -467,14 +467,14 @@ files = [ [[package]] name = "platformdirs" -version = "3.5.3" +version = "3.8.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.5.3-py3-none-any.whl", hash = "sha256:0ade98a4895e87dc51d47151f7d2ec290365a585151d97b4d8d6312ed6132fed"}, - {file = "platformdirs-3.5.3.tar.gz", hash = "sha256:e48fabd87db8f3a7df7150a4a5ea22c546ee8bc39bc2473244730d4b56d2cc4e"}, + {file = "platformdirs-3.8.0-py3-none-any.whl", hash = "sha256:ca9ed98ce73076ba72e092b23d3c93ea6c4e186b3f1c3dad6edd98ff6ffcca2e"}, + {file = "platformdirs-3.8.0.tar.gz", hash = "sha256:b0cabcb11063d21a0b261d557acb0a9d2126350e63b70cdf7db6347baea456dc"}, ] [package.extras] @@ -483,14 +483,14 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- [[package]] name = "pluggy" -version = "1.0.0" +version = "1.2.0" description = "plugin and hook calling mechanisms for python" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] [package.extras] @@ -499,14 +499,14 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "pre-commit" -version = "3.3.2" +version = "3.3.3" description = "A framework for managing and maintaining multi-language pre-commit hooks." 
category = "dev" optional = false python-versions = ">=3.8" files = [ - {file = "pre_commit-3.3.2-py2.py3-none-any.whl", hash = "sha256:8056bc52181efadf4aac792b1f4f255dfd2fb5a350ded7335d251a68561e8cb6"}, - {file = "pre_commit-3.3.2.tar.gz", hash = "sha256:66e37bec2d882de1f17f88075047ef8962581f83c234ac08da21a0c58953d1f0"}, + {file = "pre_commit-3.3.3-py2.py3-none-any.whl", hash = "sha256:10badb65d6a38caff29703362271d7dca483d01da88f9d7e05d0b97171c136cb"}, + {file = "pre_commit-3.3.3.tar.gz", hash = "sha256:a2256f489cd913d575c145132ae196fe335da32d91a8294b7afe6622335dd023"}, ] [package.dependencies] @@ -570,14 +570,14 @@ files = [ [[package]] name = "pytest" -version = "7.3.2" +version = "7.4.0" description = "pytest: simple powerful testing with Python" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, - {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, ] [package.dependencies] @@ -784,14 +784,14 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "setuptools" -version = "67.8.0" +version = "68.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "setuptools-67.8.0-py3-none-any.whl", hash = "sha256:5df61bf30bb10c6f756eb19e7c9f3b473051f48db77fddbe06ff2ca307df9a6f"}, - {file = "setuptools-67.8.0.tar.gz", hash = "sha256:62642358adc77ffa87233bc4d2354c4b2682d214048f500964dbe760ccedf102"}, + {file = "setuptools-68.0.0-py3-none-any.whl", hash = "sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"}, + {file = "setuptools-68.0.0.tar.gz", hash = "sha256:baf1fdb41c6da4cd2eae722e135500da913332ab3f2f5c7d33af9b492acb5235"}, ] [package.extras] @@ -801,14 +801,14 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( [[package]] name = "sqlfluff" -version = "2.1.1" +version = "1.4.5" description = "The SQL Linter for Humans" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "sqlfluff-2.1.1-py3-none-any.whl", hash = "sha256:7f303189ba3d1e50df5782c0bea974de8a25db0e6d1f0d1238f75b848c8d3f27"}, - {file = "sqlfluff-2.1.1.tar.gz", hash = "sha256:974a2eecb91ab3236b73097ea6c7532a96dd0dcafd4394fa2305b2845ef4d7d2"}, + {file = "sqlfluff-1.4.5-py3-none-any.whl", hash = "sha256:10b193ee009046f8fe3af9d1895f767b691635653cccc8698bc01eee2ccb69df"}, + {file = "sqlfluff-1.4.5.tar.gz", hash = "sha256:68a8abde2fa22b76a20d908e22087e2e12b0e4a2085be2ea955ff6305b0fea30"}, ] [package.dependencies] @@ -823,7 +823,7 @@ pytest = "*" pyyaml = ">=5.1" regex = "*" tblib = "*" -toml = {version = "*", markers = "python_version < \"3.11\""} +toml = "*" tqdm = "*" typing-extensions = "*" @@ -947,26 +947,26 @@ typing-extensions = ">=3.7.4" [[package]] name = "virtualenv" -version = "20.23.0" +version = "20.23.1" description = "Virtual Python Environment builder" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.23.0-py3-none-any.whl", hash = "sha256:6abec7670e5802a528357fdc75b26b9f57d5d92f29c5462ba0fbe45feacc685e"}, - {file = 
"virtualenv-20.23.0.tar.gz", hash = "sha256:a85caa554ced0c0afbd0d638e7e2d7b5f92d23478d05d17a76daeac8f279f924"}, + {file = "virtualenv-20.23.1-py3-none-any.whl", hash = "sha256:34da10f14fea9be20e0fd7f04aba9732f84e593dac291b757ce42e3368a39419"}, + {file = "virtualenv-20.23.1.tar.gz", hash = "sha256:8ff19a38c1021c742148edc4f81cb43d7f8c6816d2ede2ab72af5b84c749ade1"}, ] [package.dependencies] distlib = ">=0.3.6,<1" -filelock = ">=3.11,<4" -platformdirs = ">=3.2,<4" +filelock = ">=3.12,<4" +platformdirs = ">=3.5.1,<4" [package.extras] -docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=22.12)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.3.1)", "pytest-env (>=0.8.1)", "pytest-freezegun (>=0.4.2)", "pytest-mock (>=3.10)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=67.7.1)", "time-machine (>=2.9)"] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.3.1)", "pytest-env (>=0.8.1)", "pytest-freezer (>=0.4.6)", "pytest-mock (>=3.10)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=67.8)", "time-machine (>=2.9)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "ac647139b2d9874aac05344b01c1eb96ca9b6f23dc3ff9acac73a588b5a831d4" +content-hash = "3099d2a7d65e0caadd21d1d3bfc65f214437c08154cba79076a994aff27cdc10" diff --git a/pysparkler/pyproject.toml b/pysparkler/pyproject.toml index 297e94e..8493631 100644 --- a/pysparkler/pyproject.toml +++ b/pysparkler/pyproject.toml @@ -18,7 +18,7 @@ libcst = "^1.0.1" click = "^8.1.3" rich = "^13.3.3" nbformat = "^5.8.0" -sqlfluff = "^2.1.1" +sqlfluff = "^1.0.0" sqlfluff-plugin-sparksql-upgrade = {path = "../sql"} diff --git a/pysparkler/pysparkler/cli.py b/pysparkler/pysparkler/cli.py index 415f31d..6a5e2ff 100644 --- a/pysparkler/pysparkler/cli.py +++ b/pysparkler/pysparkler/cli.py @@ -30,6 +30,7 @@ from rich.table import Table from pysparkler.api import PySparkler +from pysparkler.sql_21_to_33 import SqlStatementUpgradeAndCommentWriter stdout = Console() stderr = Console(stderr=True) @@ -187,6 +188,31 @@ def version(ctx: Context) -> None: stdout.print(table) +@run.command() +@click.pass_context +@catch_exception() +def upgrade_sql(ctx: Context) -> None: + """Upgrades a non-templated Spark SQL statement read from stdin to be compatible with the latest Spark version. 
+ + Examples: \n + echo "SELECT * FROM table" | pysparkler upgrade-sql \n + cat /path/to/input.sql | pysparkler upgrade-sql + """ + + # Read SQL from stdin + input_sql = click.get_text_stream("stdin").read() + if ctx.obj["verbose"]: + stdout.rule("Input SQL") + stdout.print(Syntax(input_sql, "sql")) + + # Upcast the SQL to be compatible with the latest Spark version + output_sql = SqlStatementUpgradeAndCommentWriter.do_fix(input_sql) + + # Output the upcasted SQL to stdout + stdout.rule("Output SQL") + stdout.print(Syntax(output_sql, "sql")) + + def print_command_params(ctx): """Prints the command params""" if ctx.obj["verbose"]: diff --git a/pysparkler/pysparkler/sql_21_to_33.py b/pysparkler/pysparkler/sql_21_to_33.py index f182225..4f3a8ea 100644 --- a/pysparkler/pysparkler/sql_21_to_33.py +++ b/pysparkler/pysparkler/sql_21_to_33.py @@ -87,10 +87,10 @@ def leave_Call(self, original_node: cst.Call, updated_node: cst.Call) -> cst.Cal ) return updated_node.with_changes(args=[cst.Arg(value=updated_sql_node)]) - except Exception as e: # pylint: disable=broad-except - print(f"Failed to parse SQL: {sql_node} with error: {e}") + except Exception: # pylint: disable=broad-except self.comment = "Unable to inspect the Spark SQL statement since the formatted string SQL has complex \ -expressions within. Please de-template the SQL and use the Sqlfluff tooling to upcast the SQL yourself." +expressions within. Please de-template the SQL and use the 'pysparkler upgrade-sql' CLI command to upcast the SQL \ +yourself." self.sql_upgraded = False return updated_node diff --git a/pysparkler/tests/test_sql_21_to_33.py b/pysparkler/tests/test_sql_21_to_33.py index c5f31a6..0b3a141 100644 --- a/pysparkler/tests/test_sql_21_to_33.py +++ b/pysparkler/tests/test_sql_21_to_33.py @@ -77,7 +77,7 @@ def test_unable_to_upgrade_templated_sql_with_complex_expressions(): spark = SparkSession.builder.appName("SQL Example").getOrCreate() table_name = "my_table" num = 10 -result = spark.sql(f"select cast(dateint as int) val from {table_name} where x < {num * 100} limit 10") # PY21-33-001: Unable to inspect the Spark SQL statement since the formatted string SQL has complex expressions within. Please de-template the SQL and use the Sqlfluff tooling to upcast the SQL yourself. # noqa: E501 +result = spark.sql(f"select cast(dateint as int) val from {table_name} where x < {num * 100} limit 10") # PY21-33-001: Unable to inspect the Spark SQL statement since the formatted string SQL has complex expressions within. Please de-template the SQL and use the 'pysparkler upgrade-sql' CLI command to upcast the SQL yourself. # noqa: E501 spark.stop() """ assert modified_code == expected_code From 75d687b9209507a05643928c64eeb71cf70d7e2a Mon Sep 17 00:00:00 2001 From: Dhruv Pratap Date: Tue, 27 Jun 2023 14:07:59 -0400 Subject: [PATCH 4/5] PySparkler: Integrate Sqlfluff upcast with the main upgrade API. 
From 75d687b9209507a05643928c64eeb71cf70d7e2a Mon Sep 17 00:00:00 2001
From: Dhruv Pratap
Date: Tue, 27 Jun 2023 14:07:59 -0400
Subject: [PATCH 4/5] PySparkler: Integrate Sqlfluff upcast with the main upgrade API.

---
 pysparkler/pysparkler/api.py                        | 2 ++
 pysparkler/pysparkler/sql_21_to_33.py               | 7 +++++++
 pysparkler/tests/sample/InputPySparkNotebook.ipynb  | 3 +++
 pysparkler/tests/sample/OutputPySparkNotebook.ipynb | 3 +++
 pysparkler/tests/sample/input_pyspark.py            | 3 +++
 pysparkler/tests/sample/output_pyspark.py           | 3 +++
 6 files changed, 21 insertions(+)

diff --git a/pysparkler/pysparkler/api.py b/pysparkler/pysparkler/api.py
index c8cb979..0d10adb 100644
--- a/pysparkler/pysparkler/api.py
+++ b/pysparkler/pysparkler/api.py
@@ -27,6 +27,7 @@
 from pysparkler.pyspark_24_to_30 import pyspark_24_to_30_transformers
 from pysparkler.pyspark_31_to_32 import pyspark_31_to_32_transformers
 from pysparkler.pyspark_32_to_33 import pyspark_32_to_33_transformers
+from pysparkler.sql_21_to_33 import sql_21_to_33_transformers


 class PySparkler:
@@ -53,6 +54,7 @@ def transformers(self) -> list[BaseTransformer]:
             *pyspark_24_to_30_transformers(),
             *pyspark_31_to_32_transformers(),
             *pyspark_32_to_33_transformers(),
+            *sql_21_to_33_transformers(),
         ]
         # Override the default values of the transformers with the user provided values
         for transformer in all_transformers:
diff --git a/pysparkler/pysparkler/sql_21_to_33.py b/pysparkler/pysparkler/sql_21_to_33.py
index 4f3a8ea..bd94e18 100644
--- a/pysparkler/pysparkler/sql_21_to_33.py
+++ b/pysparkler/pysparkler/sql_21_to_33.py
@@ -148,3 +148,10 @@ def do_fix(sql: str) -> str:
             rules=[SPARK_SQL_CAST_RULE],
             fix_even_unparsable=True,
         )
+
+
+def sql_21_to_33_transformers() -> list[cst.CSTTransformer]:
+    """Return a list of transformers for SQL 2.1 to 3.3 migration guide"""
+    return [
+        SqlStatementUpgradeAndCommentWriter(),
+    ]
diff --git a/pysparkler/tests/sample/InputPySparkNotebook.ipynb b/pysparkler/tests/sample/InputPySparkNotebook.ipynb
index 6464293..c181b39 100644
--- a/pysparkler/tests/sample/InputPySparkNotebook.ipynb
+++ b/pysparkler/tests/sample/InputPySparkNotebook.ipynb
@@ -20,6 +20,9 @@
     "spark = SparkSession.builder.appName('example').getOrCreate()\n",
     "spark.conf.set(\"spark.sql.execution.arrow.enabled\", \"true\")\n",
     "\n",
+    "table_name = \"my_table\"\n",
+    "result = spark.sql(f\"select cast(dateint as int) val from {table_name} limit 10\")\n",
+    "\n",
     "data = [(\"James\", \"\", \"Smith\", \"36636\", \"M\", 60000),\n",
     "        (\"Jen\", \"Mary\", \"Brown\", \"\", \"F\", 0)]\n",
     "\n",
diff --git a/pysparkler/tests/sample/OutputPySparkNotebook.ipynb b/pysparkler/tests/sample/OutputPySparkNotebook.ipynb
index 2d2fe53..a165d15 100644
--- a/pysparkler/tests/sample/OutputPySparkNotebook.ipynb
+++ b/pysparkler/tests/sample/OutputPySparkNotebook.ipynb
@@ -20,6 +20,9 @@
     "spark = SparkSession.builder.appName('example').getOrCreate()\n",
     "spark.conf.set(\"spark.sql.execution.arrow.enabled\", \"true\") # PY24-30-004: PySpark 3.0 requires PyArrow version 0.12.1 or higher when spark.sql.execution.arrow.enabled is set to true # PY24-30-005: Consider setting spark.sql.execution.pandas.convertToArrowArraySafely to true to raise errors in case of Integer overflow or Floating point truncation, instead of silent allows. # noqa: E501\n",
     "\n",
+    "table_name = \"my_table\"\n",
+    "result = spark.sql(f\"select int(dateint) val from {table_name} limit 10\") # PY21-33-001: Spark SQL statement has been upgraded to Spark 3.3 compatible syntax. # noqa: E501\n",
+    "\n",
     "data = [(\"James\", \"\", \"Smith\", \"36636\", \"M\", 60000),\n",
     "        (\"Jen\", \"Mary\", \"Brown\", \"\", \"F\", 0)]\n",
     "\n",
diff --git a/pysparkler/tests/sample/input_pyspark.py b/pysparkler/tests/sample/input_pyspark.py
index 785820e..d829a6d 100644
--- a/pysparkler/tests/sample/input_pyspark.py
+++ b/pysparkler/tests/sample/input_pyspark.py
@@ -11,6 +11,9 @@
 spark = SparkSession.builder.appName('example').getOrCreate()
 spark.conf.set("spark.sql.execution.arrow.enabled", "true")

+table_name = "my_table"
+result = spark.sql(f"select cast(dateint as int) val from {table_name} limit 10")
+
 data = [("James", "", "Smith", "36636", "M", 60000),
         ("Jen", "Mary", "Brown", "", "F", 0)]

diff --git a/pysparkler/tests/sample/output_pyspark.py b/pysparkler/tests/sample/output_pyspark.py
index c4231fc..b415a1a 100644
--- a/pysparkler/tests/sample/output_pyspark.py
+++ b/pysparkler/tests/sample/output_pyspark.py
@@ -11,6 +11,9 @@
 spark = SparkSession.builder.appName('example').getOrCreate()
 spark.conf.set("spark.sql.execution.arrow.enabled", "true") # PY24-30-004: PySpark 3.0 requires PyArrow version 0.12.1 or higher when spark.sql.execution.arrow.enabled is set to true # PY24-30-005: Consider setting spark.sql.execution.pandas.convertToArrowArraySafely to true to raise errors in case of Integer overflow or Floating point truncation, instead of silent allows. # noqa: E501

+table_name = "my_table"
+result = spark.sql(f"select int(dateint) val from {table_name} limit 10") # PY21-33-001: Spark SQL statement has been upgraded to Spark 3.3 compatible syntax. # noqa: E501
+
 data = [("James", "", "Smith", "36636", "M", 60000),
         ("Jen", "Mary", "Brown", "", "F", 0)]

From 71e1325eb838dedc9f738646cb7b0b99cdc6af7f Mon Sep 17 00:00:00 2001
From: Dhruv Pratap
Date: Tue, 27 Jun 2023 15:24:40 -0400
Subject: [PATCH 5/5] PySparkler: Add documentation for upgrade-sql command.

---
 pysparkler/README.md                  | 27 +++++++++++++++++++++++++++
 pysparkler/pysparkler/cli.py          | 19 ++++++++++++++-----
 pysparkler/pysparkler/sql_21_to_33.py |  8 ++++++++
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/pysparkler/README.md b/pysparkler/README.md
index d0712b4..041acd6 100644
--- a/pysparkler/README.md
+++ b/pysparkler/README.md
@@ -67,6 +67,7 @@ to upgrade your PySpark scripts. In the latest stable version it supports the fo
 | Upgrading from PySpark 2.3 to 2.4               | ✅ | [Link](https://spark.apache.org/docs/latest/api/python/migration_guide/pyspark_upgrade.html#upgrading-from-pyspark-2-3-to-2-4) |
 | Upgrading from PySpark 2.3.0 to 2.3.1 and above | ✅ | [Link](https://spark.apache.org/docs/latest/api/python/migration_guide/pyspark_upgrade.html#upgrading-from-pyspark-2-3-0-to-2-3-1-and-above) |
 | Upgrading from PySpark 2.2 to 2.3               | ✅ | [Link](https://spark.apache.org/docs/latest/api/python/migration_guide/pyspark_upgrade.html#upgrading-from-pyspark-2-2-to-2-3) |
+| Upgrading from PySpark 2.1 to 2.2               | ✅ | NA |
 | Upgrading from PySpark 1.4 to 1.5               | ❌ | [Link](https://spark.apache.org/docs/latest/api/python/migration_guide/pyspark_upgrade.html#upgrading-from-pyspark-1-4-to-1-5) |
 | Upgrading from PySpark 1.0-1.2 to 1.3           | ❌ | [Link](https://spark.apache.org/docs/latest/api/python/migration_guide/pyspark_upgrade.html#upgrading-from-pyspark-1-0-1-2-to-1-3) |
@@ -78,6 +79,7 @@ The tool supports the following features:
 |-----------------------------------------------|-----------|
 | Upgrade PySpark Python script                 | ✅        |
 | Upgrade PySpark Jupyter Notebook              | ✅        |
+| Upgrade SQL                                   | ✅        |
 | Dry-run Mode                                  | ✅        |
 | Verbose Mode                                  | ✅        |
 | Customize code transformers using YAML config | ✅        |
@@ -118,6 +120,31 @@ To change the output kernel name in the output Jupyter notebook, you can use the
 ```bash
 pysparkler upgrade --input-file /path/to/notebook.ipynb --output-kernel spark33-python3
 ```
+### Upgrade SQL
+
+When PySparkler encounters a SQL statement in the input script, it attempts to upgrade it. However, it is not
+always possible to upgrade certain formatted string SQL statements that have complex expressions within. In such
+cases the tool leaves code hints to let users know that they need to upgrade the SQL themselves.
+
+To facilitate this, it exposes an `upgrade-sql` command that lets users perform the upgrade themselves. The steps are:
+
+1. De-template the SQL.
+1. Upgrade the de-templated SQL using `pysparkler upgrade-sql`. See below for details.
+1. Re-template the upgraded SQL.
+1. Replace the old SQL with the upgraded SQL in the input script.
+
+To perform step #2, you can either echo the SQL statement and pipe it to the tool:
+
+```bash
+echo "SELECT * FROM table" | pysparkler upgrade-sql
+```
+
+or you can use the `cat` command to pipe the SQL statement to the tool:
+
+```bash
+cat /path/to/sql.sql | pysparkler upgrade-sql
+```
+
 ### Dry-Run Mode

 For both the above upgrade options, to run in dry mode, you can use the `--dry-run` flag. This will not write the
diff --git a/pysparkler/pysparkler/cli.py b/pysparkler/pysparkler/cli.py
index 6a5e2ff..4f0a5bf 100644
--- a/pysparkler/pysparkler/cli.py
+++ b/pysparkler/pysparkler/cli.py
@@ -203,14 +203,23 @@ def upgrade_sql(ctx: Context) -> None:
     input_sql = click.get_text_stream("stdin").read()
     if ctx.obj["verbose"]:
         stdout.rule("Input SQL")
-        stdout.print(Syntax(input_sql, "sql"))
+        stdout.print(Syntax(input_sql, "sql", line_numbers=True))

-    # Upcast the SQL to be compatible with the latest Spark version
+    # Ensure SQL is parsable and upcast the SQL to be compatible with the latest Spark version
+    SqlStatementUpgradeAndCommentWriter.do_parse(input_sql)
     output_sql = SqlStatementUpgradeAndCommentWriter.do_fix(input_sql)

-    # Output the upcasted SQL to stdout
-    stdout.rule("Output SQL")
-    stdout.print(Syntax(output_sql, "sql"))
+    if input_sql == output_sql:
+        stdout.print("No upgrades detected in Input SQL", style="green")
+    else:
+        # Output the upcasted SQL to stdout
+        stdout.rule("Output SQL")
+        stdout.print(Syntax(output_sql, "sql", line_numbers=True))
+        stdout.rule("Unified Diff")
+        diff = difflib.unified_diff(input_sql.splitlines(), output_sql.splitlines())
+        for line in diff:
+            stdout.print(Syntax(line, "sql"))
+        stdout.rule("End of Diff")


 def print_command_params(ctx):
diff --git a/pysparkler/pysparkler/sql_21_to_33.py b/pysparkler/pysparkler/sql_21_to_33.py
index bd94e18..096ffd8 100644
--- a/pysparkler/pysparkler/sql_21_to_33.py
+++ b/pysparkler/pysparkler/sql_21_to_33.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 #
+from typing import Any

 import libcst as cst
 import libcst.matchers as m
@@ -149,6 +150,13 @@ def do_fix(sql: str) -> str:
             fix_even_unparsable=True,
         )
+
+    @staticmethod
+    def do_parse(sql: str) -> dict[str, Any]:
+        return sqlfluff.parse(
+            sql,
+            dialect=SPARK_SQL_DIALECT,
+        )


 def sql_21_to_33_transformers() -> list[cst.CSTTransformer]:
     """Return a list of transformers for SQL 2.1 to 3.3 migration guide"""
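
Taken together, the series makes the README's DIY loop fully scriptable. The sketch below is hypothetical code, not part of any commit above: the helper names and the `tmpl_<n>` placeholder scheme are invented for illustration, and a plain `sqlfluff.fix` call with the `sparksql` dialect (the same simple API and `fix_even_unparsable` flag the patch itself uses) stands in for the plugin rule behind `pysparkler upgrade-sql`:

```python
# A sketch of the README's de-template -> upgrade -> re-template round trip.
# The helpers and the tmpl_<n> placeholder scheme are hypothetical; plain
# sqlfluff.fix stands in for the spark-upgrade plugin rule used by the CLI.
import re

import sqlfluff


def de_template(templated_sql: str) -> tuple[str, list[str]]:
    """Swap each f-string expression like {table_name} for a parsable placeholder."""
    expressions = re.findall(r"\{[^{}]+\}", templated_sql)
    sql = templated_sql
    for i, expr in enumerate(expressions):
        sql = sql.replace(expr, f"tmpl_{i}", 1)
    return sql, expressions


def re_template(sql: str, expressions: list[str]) -> str:
    """Restore the original f-string expressions in place of the placeholders."""
    # Naive prefix handling: fine for small counts, as in this sketch.
    for i, expr in enumerate(expressions):
        sql = sql.replace(f"tmpl_{i}", expr, 1)
    return sql


templated = "select cast(dateint as int) val from {table_name} where x < {num * 100} limit 10"

detemplated, exprs = de_template(templated)

# Step 2 of the README, done in-process instead of via `pysparkler upgrade-sql`.
upgraded = sqlfluff.fix(detemplated, dialect="sparksql", fix_even_unparsable=True)

print(re_template(upgraded, exprs))
```

Piping the de-templated text through `pysparkler upgrade-sql`, as the README shows, is the CLI equivalent of the `sqlfluff.fix` line for step #2.
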