@@ -63,6 +63,7 @@ get_storage_commands() {
6363 fetch_cmd=( " gsutil" " cp" )
6464 storage_type=" gs"
6565 gsutil_auth
66+ find_object=" find_object_gs"
6667 ;;
6768 s3://* )
6869 echo " >> Storage type: aws s3"
@@ -72,6 +73,7 @@ get_storage_commands() {
7273 ls_cmd=( " aws" " s3" " ls" " ${AWS_S3_ADDITIONAL_ARGS} " )
7374 fetch_cmd=( " aws" " s3" " cp" " ${AWS_S3_ADDITIONAL_ARGS} " )
7475 storage_type=" s3"
76+ find_object=" find_object_s3"
7577 ;;
7678 file://* |/* |./* )
7779 echo " >> Storage type: file"
@@ -80,6 +82,7 @@ get_storage_commands() {
8082 fetch_cmd=( " cat" )
8183 source=" ${source# file: \/\/ } "
8284 storage_type=" file"
85+ find_object=" find_object_file"
8386 ;;
8487 * )
8588 echoerr " Unknown storage type"
@@ -136,3 +139,108 @@ gsutil_auth() {
136139 " service_account = default" > /etc/boto.cfg
137140 fi
138141}
142+
143+ # helper functions
function get_filename_from_object_path() {
  # Print the trailing component of an object path or listing line, i.e. the
  # text after the last "/" or space. Input that contains neither separator
  # is printed unchanged.
  #   $1  object path (or one line of listing output)
  local object_path="${1}"
  local keep_last_component='s#.*[/ ]([^/]*)$#\1#'

  echo "${object_path}" | sed -E -e "${keep_last_component}"
}
148+
function get_basename_from_object_path() {
  # Print the scheme and bucket part of an object URL, always ending in "/":
  #   gs://mybucket/20230413000003/db.sql.lz4  ->  gs://mybucket/
  #   s3://mybucket                            ->  s3://mybucket/
  # Only the file://, s3:// and gs:// schemes are recognised; any other
  # input is printed unchanged.
  #   $1  object URL
  #
  # The first expression trims everything after the bucket. The second one
  # appends the slash to a bucket-only URL, so that callers can safely build
  # "${base}${timestamp}/" from the result. Extended regex syntax (-E) is
  # used instead of the GNU-only \| and \+ escapes.
  echo "${1}" | sed -E \
    -e 's#(file|s3|gs)://([^/]+)/.*#\1://\2/#' \
    -e 's#^((file|s3|gs)://[^/]+)$#\1/#'
}
153+
function get_timestamp_from_object_path() {
  # Print the timestamp embedded in an object path: the run of 2 to 14 digits
  # that directly follows the right-most "/" which is immediately followed by
  # at least two digits. Prints nothing when the path has no such run.
  #   $1  object path
  local object_path="${1}"
  local extract_digits='s#.*/\([0-9]\{2,14\}\).*#\1#p'

  # -n with the "p" flag prints only on a successful substitution; "t" then
  # skips the "q", which otherwise stops sed at the first non-matching line.
  echo "${object_path}" | sed -n -e "${extract_digits}" -e 't' -e 'q'
}
158+
function check_object_exists() {
  # Report whether the storage listing command prints anything for a path.
  #   $1  object path to look up with ls_cmd
  # Returns 0 when the listing produces output; otherwise logs
  # "Error file not found" through echoerr and returns 1. Only the output is
  # inspected, not the exit status of the listing command.
  local object_path="${1:-}"

  # An empty path would vanish when eval re-parses the command line, so the
  # bare listing command would run and its output would be mistaken for a hit.
  if [[ -z "${object_path}" ]]; then
    echoerr "Error file not found"
    return 1
  fi

  # eval is kept on purpose: ls_cmd may hold a multi-word or empty element
  # (the S3 variant stores ${AWS_S3_ADDITIONAL_ARGS} as a single element) that
  # has to be re-split. The path itself is shell-quoted with %q so spaces,
  # newlines or metacharacters in it cannot be re-interpreted by eval.
  if [[ -n "$(eval "${ls_cmd[@]}" "$(printf '%q' "${object_path}")")" ]]; then
    return 0
  fi

  echoerr "Error file not found"
  return 1
}
167+
function find_object_gs {
  # Resolve a possibly partial gs:// location to a single object URL and
  # print it on stdout.
  #   $1  source URL
  #   $2  database name (optional); matched as "/<database>." or "/<database>-"
  # The following are all valid:
  #   gs://mybucket/20230413000003/my_database.sql.lz4
  #   gs://mybucket/20230413000003/ my_database
  #   gs://mybucket/ my_database
  #   gs://mybucket/20230413 my_database
  # Calls `exit 1` (after an echoerr message) when nothing matches, when more
  # than one object matches, or when the final existence check fails.
  #
  # NOTE(review): source, database, timestamp, base and full_path are assigned
  # without `local`, exactly as before, in case callers read them afterwards.

  source="${1}"
  database="${2:-}"
  timestamp="$(get_timestamp_from_object_path "${source}")"
  base="$(get_basename_from_object_path "${source}")"

  if [[ "${timestamp}" == "" ]]; then
    # no timestamp in the path, find the latest
    timestamp="$(eval "${ls_cmd[@]}" "${source}" | sed -E -e '/[0-9]{14}/!d' -e 's/.*([0-9]{14})\/$/\1/' | sort | tail -n1)"
    if [[ -z "${timestamp}" ]]; then
      echoerr "Error no timestamped path found under ${source}"
      exit 1
    fi
    # ${source%/}/ guarantees exactly one slash before the timestamp even
    # when the source was given without a trailing slash.
    full_path="$(eval "${ls_cmd[@]}" "${source%/}/${timestamp}/" | grep "/${database}[\.\-]")"
  else
    # has timestamp, either fully qualified, or needs expanding
    if [[ $source =~ [0-9]{14}/${database} ]]; then
      # should be complete path
      full_path="${source}"
    elif [[ $source =~ [0-9]{14} ]]; then
      # complete timestamp
      full_path="$(eval "${ls_cmd[@]}" "${source}" | grep "/${database}[\.\-]")"
    else
      # partial timestamp. search for matching object path
      full_path="$(eval "${ls_cmd[@]}" "${base}${timestamp}*/" | grep "/${database}[\.\-]")"
    fi
  fi

  # An empty result must not reach check_object_exists: the empty argument
  # would be dropped by eval and the bare listing would look like a match.
  if [[ -z "${full_path}" ]]; then
    echoerr "Error file not found"
    exit 1
  fi

  # Several matching lines mean the request is ambiguous; a multi-line value
  # must also never be handed to eval.
  if [[ "${full_path}" == *$'\n'* ]]; then
    echoerr "Error too many items found. Object path is not distinct."
    exit 1
  fi

  check_object_exists "${full_path}" || { echoerr "Error file not found"; exit 1; }
  echo "${full_path}"
}
201+
202+
function find_object_s3 {
  # Resolve a possibly partial s3:// location to a single object URL and
  # print it on stdout.
  #   $1  source URL
  #   $2  database name (optional); file names are matched as "<database>."
  #       or "<database>-" at the start of the name
  # The following are all valid:
  #   s3://mybucket/20230413000003/my_database.sql.lz4
  #   s3://mybucket/20230413000003/ my_database
  #   s3://mybucket/ my_database
  #   s3://mybucket/20230413 my_database
  # Calls `exit 1` (after an echoerr message) when no timestamp or file
  # matches, when the match is ambiguous, or when the final existence check
  # fails.
  #
  # NOTE(review): source, database, timestamp, base, file, timestamp_count and
  # full_path are assigned without `local`, exactly as before, in case callers
  # read them afterwards.

  source="${1}"
  database="${2:-}"
  timestamp="$(get_timestamp_from_object_path "${source}")"
  base="$(get_basename_from_object_path "${source}")"
  # Paths below are built as "${base}${timestamp}/", so base must end in "/".
  base="${base%/}/"

  # Directory whose listing still has to be searched for the database file;
  # stays empty when the source is already a complete object path.
  local object_dir=""

  if [[ "${timestamp}" == "" ]]; then
    # no timestamp in the path, find the latest
    timestamp="$(eval "${ls_cmd[@]}" "${base}" | sed -E -e '/[0-9]{14}/!d' -e 's/.*([0-9]{14})\/$/\1/' | sort | tail -n1)"
    if [[ -z "${timestamp}" ]]; then
      echoerr "Error no timestamped path found under ${base}"
      exit 1
    fi
    object_dir="${base}${timestamp}/"
  elif [[ $source =~ [0-9]{14}/${database} ]]; then
    # should be complete path
    full_path="${source}"
  elif [[ $source =~ [0-9]{14} ]]; then
    # complete timestamp; force a single trailing slash so the listing shows
    # the directory contents and the file name can be appended directly
    object_dir="${source%/}/"
  else
    # partial timestamp. search for matching object path
    timestamp="$(eval "${ls_cmd[@]}" "${base}" | sed -E -e '/[0-9]{14}/!d' -e 's/.*([0-9]{14})\/$/\1/' | grep "${timestamp}")"
    # wc -l on a here-string reports 1 even for an empty value, so the
    # no-match case needs its own check.
    if [[ -z "${timestamp}" ]]; then
      echoerr "Error file not found"
      exit 1
    fi
    timestamp_count=$(wc -l <<< "${timestamp}")
    if [[ "${timestamp_count}" -gt 1 ]]; then
      echoerr "Error too many items found. Timestamp is not distinct."
      exit 1
    fi
    object_dir="${base}${timestamp}/"
  fi

  if [[ -n "${object_dir}" ]]; then
    # Reduce each listing line to its last component, then keep the names
    # that start with the database name.
    file="$(eval "${ls_cmd[@]}" "${object_dir}" | sed -E -e 's/.*[\/ ]([^\/]*)$/\1/' | grep "^${database}[\.\-]")"
    # Without a file name the directory itself would pass the existence
    # check and be returned as if it were the object.
    if [[ -z "${file}" ]]; then
      echoerr "Error file not found"
      exit 1
    fi
    if [[ "${file}" == *$'\n'* ]]; then
      echoerr "Error too many items found. Object path is not distinct."
      exit 1
    fi
    full_path="${object_dir}${file}"
  fi

  check_object_exists "${full_path}" || { echoerr "Error file not found"; exit 1; }
  echo "${full_path}"
}
242+
function find_object_file {
  # Placeholder resolver selected for file:// and local-path sources. Object
  # lookup is not available for them, so it reports that through echoerr and
  # ends the shell with status 1.
  echoerr "find_object_file not implemented"; exit 1
}
0 commit comments