added run and edited
ruchika817 committed Jul 23, 2021
1 parent 9ff584f commit cad70c9
Showing 9 changed files with 365 additions and 298 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -181,3 +181,7 @@ $RECYCLE.BIN/
**/pre-processing-code.zip

response.json

### PyCharm
.idea/
venv/
180 changes: 85 additions & 95 deletions init.sh
100644 → 100755
@@ -19,16 +19,47 @@ while [[ $# -gt 0 ]]; do
shift;
current_arg="$1"
case ${opt} in
"-s"|"--s3-bucket") export S3_BUCKET="$1"; shift;;
"-d"|"--dataset-name") export DATASET_NAME="$1"; shift;;
"-p"|"--product-name") export PRODUCT_NAME="$1"; shift;;
"-i"|"--product-id") export PRODUCT_ID="$1"; shift;;
"-r"|"--region") export REGION="$1"; shift;;
"-f"|"--profile") PROFILE=" --profile $1"; shift;;
"--rdp-role-arn") export REARC_DATA_PLATFORM_ROLE_ARN="$1"; shift;;
"--rdp-external-id") export REARC_DATA_PLATFORM_EXTERNAL_ID="$1"; shift;;
"--customer-id") export CUSTOMER_ID="$1"; shift;;
"--schedule-cron") export SCHEDULE_CRON="$1"; shift;;
"--asset-bucket") export ASSET_BUCKET="$1"; shift;;
"--manifest-bucket") export MANIFEST_BUCKET="$1"; shift;;
"--dataset-name") export DATASET_NAME="$1"; shift;;
"--product-name") export PRODUCT_NAME="$1"; shift;;
"--product-id") export PRODUCT_ID="$1"; shift;;
"--dataset-arn") export DATASET_ARN="$1"; shift;;
"--region") export REGION="$1"; shift;;
"--first-revision") export FIRST_REVISION="$1"; shift;;
"--products-info-file") export PRODUCTS_INFO_FILE="$1"; shift;;
"--source-url") export SOURCE_URL="$1"; shift;;
"--product-code") export PRODUCT_CODE="$1"; shift;;
"--product-url") export PRODUCT_URL="$1"; shift;;
"--profile") PROFILE=" --profile $1"; shift;;
*) echo "ERROR: Invalid option: \"$opt\"" >&2; exit 1;;
esac
done
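
# Example invocation with the new long-form flags (every value below is an
# illustrative placeholder, not a real ARN, bucket, or ID):
#   ./init.sh --rdp-role-arn arn:aws:iam::111122223333:role/rdp-role \
#     --rdp-external-id example-external-id --customer-id example-customer-id \
#     --asset-bucket example-asset-bucket --manifest-bucket example-manifest-bucket \
#     --dataset-name example-dataset --product-name "Example Product" \
#     --schedule-cron "cron(0 12 * * ? *)" --region us-east-1 \
#     --first-revision true --profile default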

echo "------------------------------------------------------------------------------"
echo "REARC_DATA_PLATFORM_ROLE_ARN: $REARC_DATA_PLATFORM_ROLE_ARN"
echo "REARC_DATA_PLATFORM_EXTERNAL_ID: $REARC_DATA_PLATFORM_EXTERNAL_ID"
echo "CUSTOMER_ID: $CUSTOMER_ID"
echo "ASSET_BUCKET: $ASSET_BUCKET"
echo "MANIFEST_BUCKET: $MANIFEST_BUCKET"
echo "DATASET_NAME: $DATASET_NAME"
echo "DATASET_ARN: $DATASET_ARN"
echo "PRODUCT_NAME: $PRODUCT_NAME"
echo "PRODUCT_ID: $PRODUCT_ID"
echo "SCHEDULE_CRON: $SCHEDULE_CRON"
echo "REGION: $REGION"
echo "PROFILE: $PROFILE"
echo "PRODUCTS_INFO_FILE: $PRODUCTS_INFO_FILE"
echo "SOURCE_URL: $SOURCE_URL"
echo "PRODUCT_CODE: $PRODUCT_CODE"
echo "PRODUCT_URL: $PRODUCT_URL"
echo "FIRST_REVISION: $FIRST_REVISION"
echo "------------------------------------------------------------------------------"

while [[ ${#DATASET_NAME} -gt 53 ]]; do
echo "dataset-name must be under 53 characters in length, enter a shorter name:"
read -p "New dataset-name: " DATASET_NAME
@@ -47,109 +78,68 @@ while [[ ${#PRODUCT_NAME} -gt 72 ]]; do
esac
done

#creating a pre-processing zip package, these commands may need to be adjusted depending on folder structure and dependencies
echo "creating a pre-processing zip package, these commands may need to be adjusted depending on folder structure and dependencies"
(cd pre-processing/pre-processing-code && zip -r pre-processing-code.zip . -x "*.dist-info/*" -x "bin/*" -x "**/__pycache__/*")
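
# Optional sanity check (standard Info-ZIP tooling assumed): list the first
# few entries of the package before uploading, e.g.
#   unzip -l pre-processing/pre-processing-code/pre-processing-code.zip | head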

#upload pre-processing-code.zip to s3
echo "uploading pre-processing-code.zip to s3"
aws s3 cp pre-processing/pre-processing-code/pre-processing-code.zip s3://$S3_BUCKET/$DATASET_NAME/automation/pre-processing-code.zip --region $REGION$PROFILE

#creating dataset on ADX
echo "creating dataset on ADX"
DATASET_COMMAND="aws dataexchange create-data-set --asset-type "S3_SNAPSHOT" --description file://dataset-description.md --name \"${PRODUCT_NAME}\" --region $REGION --output json$PROFILE"
DATASET_OUTPUT=$(eval $DATASET_COMMAND)
DATASET_ARN=$(echo $DATASET_OUTPUT | tr '\r\n' ' ' | jq -r '.Arn')
DATASET_ID=$(echo $DATASET_OUTPUT | tr '\r\n' ' ' | jq -r '.Id')

#creating pre-processing cloudformation stack
echo "creating pre-processing cloudformation stack"
CFN_STACK_NAME="producer-${DATASET_NAME}-preprocessing"
aws cloudformation create-stack --stack-name $CFN_STACK_NAME --template-body file://pre-processing/pre-processing-cfn.yaml --parameters ParameterKey=S3Bucket,ParameterValue=$S3_BUCKET ParameterKey=DataSetName,ParameterValue=$DATASET_NAME ParameterKey=DataSetArn,ParameterValue=$DATASET_ARN ParameterKey=ProductId,ParameterValue=$PRODUCT_ID ParameterKey=Region,ParameterValue=$REGION --region $REGION --capabilities "CAPABILITY_AUTO_EXPAND" "CAPABILITY_NAMED_IAM" "CAPABILITY_IAM"$PROFILE

echo "waiting for cloudformation stack to complete"
aws cloudformation wait stack-create-complete --stack-name $CFN_STACK_NAME --region $REGION$PROFILE

if [[ $? -ne 0 ]]
then
# CloudFormation stack creation failed
echo "Cloudformation stack creation failed"
exit 1
fi

#invoking the pre-processing lambda function to create first dataset revision
echo "invoking the pre-processing lambda function to create first dataset revision"
LAMBDA_FUNCTION_NAME="source-for-${DATASET_NAME}"
# AWS CLI version 2 changes require explicitly declaring `--cli-binary-format raw-in-base64-out` for the format of the `--payload`
LAMBDA_FUNCTION_STATUS_CODE=$(aws lambda invoke --function-name $LAMBDA_FUNCTION_NAME --invocation-type "RequestResponse" --payload '{ "test": "event" }' response.json --cli-binary-format raw-in-base64-out --region $REGION --query 'StatusCode' --output text$PROFILE)
aws s3 cp pre-processing/pre-processing-code/pre-processing-code.zip s3://$ASSET_BUCKET/$DATASET_NAME/automation/pre-processing-code.zip --region "$REGION" $PROFILE

#grabbing dataset revision status
echo "grabbing dataset revision status"
DATASET_REVISION_STATUS=$(aws dataexchange list-data-set-revisions --data-set-id $DATASET_ID --region $REGION --query "sort_by(Revisions, &CreatedAt)[-1].Finalized"$PROFILE)
if [[ "$FIRST_REVISION" == "true" ]]; then
echo "creating dataset on ADX"
DATASET_COMMAND="aws dataexchange create-data-set --asset-type "S3_SNAPSHOT" --description file://dataset-description.md --name \"${PRODUCT_NAME}\" --region $REGION --output json $PROFILE"
DATASET_OUTPUT=$(eval $DATASET_COMMAND)
DATASET_ARN=$(echo $DATASET_OUTPUT | tr '\r\n' ' ' | jq -r '.Arn')
DATASET_ID=$(echo $DATASET_OUTPUT | tr '\r\n' ' ' | jq -r '.Id')
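
# For reference, create-data-set returns a JSON document that includes the
# Arn and Id fields extracted above; a sketch of its shape (placeholder
# values, other fields omitted):
#   {
#     "Arn": "arn:aws:dataexchange:us-east-1:111122223333:data-sets/abc123",
#     "Id": "abc123",
#     ...
#   }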

update () {
echo ""
echo "Manually create the ADX product and enter in the Product ID below:"
read -p "Product ID: " NEW_PRODUCT_ID

# Cloudformation stack update
echo "updating pre-processing cloudformation stack"
aws cloudformation update-stack --stack-name $CFN_STACK_NAME --use-previous-template --parameters ParameterKey=S3Bucket,ParameterValue=$S3_BUCKET ParameterKey=DataSetName,ParameterValue=$DATASET_NAME ParameterKey=DataSetArn,ParameterValue=$DATASET_ARN ParameterKey=ProductId,ParameterValue=$NEW_PRODUCT_ID ParameterKey=Region,ParameterValue=$REGION --region $REGION --capabilities "CAPABILITY_AUTO_EXPAND" "CAPABILITY_NAMED_IAM" "CAPABILITY_IAM"$PROFILE

echo "waiting for cloudformation stack update to complete"
aws cloudformation wait stack-update-complete --stack-name $CFN_STACK_NAME --region $REGION$PROFILE

if [[ $? -ne 0 ]]
then
echo "Cloudformation stack update failed"
break
if [[ -n "$PRODUCTS_INFO_FILE" ]]; then
echo "{\"PRODUCT_CODE\":\"${PRODUCT_CODE}\",\"PRODUCT_URL\":\"${PRODUCT_URL}\",\"SOURCE_URL\": \"${SOURCE_URL}\",\"DATASET_NAME\":\"${DATASET_NAME}\",\"DATASET_ARN\":\"${DATASET_ARN}\",\"DATASET_ID\":\"${DATASET_ID}\",\"PRODUCT_NAME\":\"${PRODUCT_NAME}\",\"PRODUCT_ID\":\"${PRODUCT_ID}\",\"SCHEDULE_CRON\":\"${SCHEDULE_CRON}\"}" >> "$PRODUCTS_INFO_FILE"
fi
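
# When --products-info-file is set, each run appends one JSON object per
# product, along these lines (placeholder values):
#   {"PRODUCT_CODE":"example-code","PRODUCT_URL":"https://example.com/product",
#    "SOURCE_URL":"https://example.com/source","DATASET_NAME":"example-dataset",
#    "DATASET_ARN":"arn:aws:dataexchange:...","DATASET_ID":"abc123",
#    "PRODUCT_NAME":"Example Product","PRODUCT_ID":"prod-123",
#    "SCHEDULE_CRON":"cron(0 12 * * ? *)"}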
echo "cloudformation stack update completed"
}

delete () {
echo "Destroying the CloudFormation stack"
aws cloudformation delete-stack --stack-name $CFN_STACK_NAME --region $REGION$PROFILE

#check status of cloudformation stack delete action
aws cloudformation wait stack-delete-complete --stack-name $CFN_STACK_NAME --region $REGION$PROFILE
if [[ $? -eq 0 ]]
then
# Cloudformation stack deleted
echo "CloudFormation stack successfully deleted"
break
else
# Cloudformation stack deletion failed
echo "Cloudformation stack deletion failed"
exit 1
fi
}

if [[ $DATASET_REVISION_STATUS == "true" ]]
then
echo "Dataset revision completed successfully"
echo ""
echo "Uploading intial assets to asset_bucket for the first revision"
aws s3 cp product-description.md "s3:https://$ASSET_BUCKET/$DATASET_NAME/dataset/product-description.md"
aws s3 cp dataset-description.md "s3:https://$ASSET_BUCKET/$DATASET_NAME/dataset/dataset-description.md"

while true; do
echo "Do you want use this script to update the CloudFormation stack? If you enter 'n' your CloudFormation stack will be destroyed:"
read -p "('y' to update / 'n' to destroy): " Y_N
case $Y_N in
[Yy]* ) update; exit;;
[Nn]* ) delete; break;;
* ) echo "Enter 'y' or 'n'.";;
esac
done

echo "Manually create the ADX product and manually re-run the pre-processing CloudFormation template using the following params:"
REVISION_COMMAND="aws dataexchange create-data-set --asset-type "S3_SNAPSHOT" --description file://dataset-description.md --name \"${PRODUCT_NAME}\" --region $REGION --output json $PROFILE"
REVISION_OUTPUT=$(eval $REVISION_COMMAND)

echo "Manually, from ADX console, create the first revision of the dataset using
product-description.md and dataset-description.md files and
then create the ADX product.
Then manually re-run the pre-processing CloudFormation template using the following params:"
echo ""
echo "S3Bucket: $S3_BUCKET"
echo "AssetBucket: $ASSET_BUCKET"
echo "ManifestBucket: $MANIFEST_BUCKET"
echo "CustomerId: $CUSTOMER_ID"
echo "DataSetName: $DATASET_NAME"
echo "DataSetArn: $DATASET_ARN"
echo "Region: $REGION"
echo "S3Bucket: $S3_BUCKET"
echo "FIRST_REVISION: false"
echo ""
echo "For the ProductId param use the Product ID of the ADX product"

else
echo "Dataset revision failed"
cat response.json
DATASET_ID=$(echo $DATASET_ARN | awk -F/ '{print $NF}')
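
# For a (hypothetical) dataset ARN such as
#   arn:aws:dataexchange:us-east-1:111122223333:data-sets/abc123
# the awk -F/ '{print $NF}' above extracts the trailing ID, "abc123".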

echo "creating pre-processing cloudformation stack"
CFN_STACK_NAME="producer-${DATASET_NAME}-preprocessing"
aws cloudformation create-stack --stack-name "$CFN_STACK_NAME" --template-body file://pre-processing/pre-processing-cfn.yaml --parameters ParameterKey=RearcDataPlatformRoleArn,ParameterValue="$REARC_DATA_PLATFORM_ROLE_ARN" ParameterKey=RearcDataPlatformExternalId,ParameterValue="$REARC_DATA_PLATFORM_EXTERNAL_ID" ParameterKey=AssetBucket,ParameterValue="$ASSET_BUCKET" ParameterKey=ManifestBucket,ParameterValue="$MANIFEST_BUCKET" ParameterKey=CustomerId,ParameterValue="$CUSTOMER_ID" ParameterKey=DataSetName,ParameterValue="$DATASET_NAME" ParameterKey=DataSetArn,ParameterValue="$DATASET_ARN" ParameterKey=ProductId,ParameterValue="$PRODUCT_ID" ParameterKey=Region,ParameterValue="$REGION" ParameterKey=ScheduleCron,ParameterValue="'$SCHEDULE_CRON'" --region "$REGION" --capabilities "CAPABILITY_AUTO_EXPAND" "CAPABILITY_NAMED_IAM" "CAPABILITY_IAM" $PROFILE

echo "waiting for cloudformation stack creation to complete"
aws cloudformation wait stack-create-complete --stack-name "$CFN_STACK_NAME" --region "$REGION" $PROFILE

if [[ $? -ne 0 ]]; then
echo "Cloudformation stack creation failed"
exit 1
fi
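
# If creation fails, a useful first check (standard CloudFormation CLI call,
# offered here as a suggestion) is the stack's failed events:
#   aws cloudformation describe-stack-events --stack-name "$CFN_STACK_NAME" \
#     --region "$REGION" $PROFILE \
#     --query 'StackEvents[?ResourceStatus==`CREATE_FAILED`]'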

echo "invoking the pre-processing lambda function to upload manifest file to manifest bucket"
LAMBDA_FUNCTION_NAME="source-for-${DATASET_NAME}"
# AWS CLI version 2 changes require explicitly declaring `--cli-binary-format raw-in-base64-out` for the format of the `--payload`
aws lambda invoke --function-name "$LAMBDA_FUNCTION_NAME" --invocation-type "RequestResponse" --payload '{ "test": "event" }' response.json --cli-binary-format raw-in-base64-out --region "$REGION" --query 'StatusCode' --output text $PROFILE

if [[ $? -ne 0 ]]; then
echo "Lambda invocation failed"
exit 1
fi
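
# response.json holds the Lambda's response payload from the invocation above;
# it can be inspected with jq (already a dependency of this script), e.g.
#   jq . response.json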

fi
38 changes: 38 additions & 0 deletions migrate.sh
@@ -0,0 +1,38 @@

#### You only need to run this part ONCE

# clone the template
git clone https://github.com/rearc-data/adx-product-rearc-data-platform-template.git

# remove extra files / folders
cd adx-product-rearc-data-platform-template
rm *.md
rm -rf .git
rm pre-processing/pre-processing-code/source_data.py

cd ..


##### Now run the following commands FOR EACH PRODUCT
# Step 1:
# Go to the cloudformation console, find the stack for the product you want to migrate, copy the parameters section, delete the stack
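# (If you prefer the CLI, the parameters can also be captured before deleting
# the stack; <old-stack-name> below is whatever the existing product's stack
# is called:
#   aws cloudformation describe-stacks --stack-name <old-stack-name> \
#     --query 'Stacks[0].Parameters' --output json )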

# Step 2: once the stack is deleted:
git clone https://github.com/rearc-data/fred-privately-owned-housing.git
cd fred-privately-owned-housing

git checkout -b rdp
cp -a ../adx-product-rearc-data-platform-template/. ./

# Step 3: Using the parameters you copied from the CloudFormation stack
# and your AWS profile name, update the variables in run.sh

# Step 4: in source.py
# replace: os.getenv('S3_BUCKET') => os.getenv('ASSET_BUCKET')
# replace: os.environ['DATA_SET_NAME'] => os.environ['DATASET_NAME']
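
# The two replacements can be scripted; a sketch assuming GNU sed and the
# usual pre-processing/pre-processing-code/source.py layout:
#   sed -i "s/os.getenv('S3_BUCKET')/os.getenv('ASSET_BUCKET')/g; \
#     s/os.environ\['DATA_SET_NAME'\]/os.environ['DATASET_NAME']/g" \
#     pre-processing/pre-processing-code/source.py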


# Step 5: make sure the variable names are correct in run.sh, then run it
chmod a+x run.sh
./run.sh
