
Commit 0826d94

implement external storage cleanup with subfolding
1 parent 8f4e8f9 commit 0826d94

File tree: 5 files changed (+53, -16 lines)


datajoint/erd.py

Lines changed: 2 additions & 2 deletions
@@ -50,8 +50,8 @@ class ERD:
         """

         def __init__(self, *args, **kwargs):
-            warnings.warn('ERD functionality depends on matplotlib and pygraphviz. Please install both of these '
-                          'libraries to enable the ERD feature.')
+            warnings.warn('ERD functionality depends on matplotlib, networkx, and pygraphviz. '
+                          'Please install both of these libraries to enable the ERD feature.')
 else:
     class ERD(nx.DiGraph):
         """

datajoint/external.py

Lines changed: 21 additions & 8 deletions
@@ -1,5 +1,5 @@
 import os
-from tqdm import tqdm
+import itertools
 from .settings import config
 from .errors import DataJointError
 from .hash import long_hash
@@ -165,22 +165,35 @@ def delete_garbage(self):
                 for ref in self.references) or "TRUE")
         print('Deleted %d items' % self.connection.query("SELECT ROW_COUNT()").fetchone()[0])

-    def clean_store(self, store, display_progress=True):
+    def clean_store(self, store, verbose=True):
         """
         Clean unused data in an external storage repository from unused blobs.
         This must be performed after delete_garbage during low-usage periods to reduce risks of data loss.
         """
         spec = config.get_store_spec(store)
-        progress = tqdm if display_progress else lambda x: x
-        in_use = set(self.fetch('hash'))
+        in_use = set(x for x in (self & '`hash` LIKE "%%{store}"'.format(store=store)).fetch('hash'))
         if spec['protocol'] == 'file':
-            for folder, _, files in progress(os.walk(os.path.join(spec['location'], self.database))):
-                for f in files:
-                    if f not in in_use:
+            count = itertools.count()
+            print('Deleting...')
+            deleted_folders = set()
+            for folder, dirs, files in os.walk(os.path.join(spec['location'], self.database), topdown=False):
+                if dirs and files:
+                    raise DataJointError('Invalid repository with files in non-terminal folder %s' % folder)
+                dirs = set(d for d in dirs if os.path.join(folder, d) not in deleted_folders)
+                if not dirs:
+                    files_not_in_use = [f for f in files if f not in in_use]
+                    for f in files_not_in_use:
                         filename = os.path.join(folder, f)
+                        next(count)
+                        if verbose:
+                            print(filename)
                         os.remove(filename)
+                    if len(files_not_in_use) == len(files):
+                        os.rmdir(folder)
+                        deleted_folders.add(folder)
+            print('Deleted %d objects' % next(count))
         elif spec['protocol'] == 's3':
             try:
-                s3.Folder(database=self.database, **spec).clean(in_use)
+                failed_deletes = s3.Folder(database=self.database, **spec).clean(in_use, verbose=verbose)
             except TypeError:
                 raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
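
Taken together with delete_garbage, the new clean_store gives a two-step cleanup. A hedged usage sketch; the schema name and the store name 'external-raw' are made up, and verbose=True mirrors the new default:

    import datajoint as dj

    schema = dj.schema('my_pipeline')   # placeholder schema name

    # Step 1: drop unreferenced hashes from the external table.
    schema.external.delete_garbage()

    # Step 2: purge the corresponding blobs from the store itself,
    # ideally during a low-usage period as the docstring advises.
    schema.external.clean_store('external-raw', verbose=True)   # made-up store name

With the file protocol this now also removes folders that become empty, which is how the cleanup copes with the subfolder layout ("subfolding") named in the commit message.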

datajoint/s3.py

Lines changed: 26 additions & 5 deletions
@@ -30,13 +30,34 @@ def get(self, blob_hash):
         except minio.error.NoSuchKey:
             return None

-    def clean(self, exclude, max_count=None):
+    def clean(self, exclude, max_count=None, verbose=False):
         """
         Delete all objects except for those in the exclude
         :param exclude: a list of blob_hashes to skip.
         :param max_count: maximum number of object to delete
-        :return: generator of objects that failed to delete
+        :param verbose: If True, print deleted objects
+        :return: list of objects that failed to delete
         """
-        return self.client.remove_objects(self.bucket, itertools.islice(
-            (x.object_name for x in self.client.list_objects(self.bucket, self.remote_path + '/')
-             if x not in exclude), max_count))
+        count = itertools.count()
+        if verbose:
+            def out(name):
+                next(count)
+                print(name)
+                return name
+        else:
+            def out(name):
+                next(count)
+                return name
+
+        if verbose:
+            print('Deleting...')
+
+        names = (out(x.object_name)
+                 for x in self.client.list_objects(self.bucket, self.remote_path + '/', recursive=True)
+                 if x.object_name.split('/')[-1] not in exclude)
+
+        failed_deletes = list(
+            self.client.remove_objects(self.bucket, itertools.islice(names, max_count)))
+
+        print('Deleted: %i S3 objects' % next(count))
+        return failed_deletes
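
For reference, a sketch of how the s3 branch of external.py drives this method directly; the store name, database name, and the empty in-use set are placeholders, and the spec keys are whatever config.get_store_spec returns for the store:

    from datajoint import s3
    from datajoint.settings import config

    spec = config.get_store_spec('external-raw')    # made-up store name
    in_use = set()                                  # placeholder: hashes still referenced
    failed = s3.Folder(database='my_pipeline', **spec).clean(in_use, max_count=1000, verbose=True)
    if failed:
        print('%d objects could not be deleted' % len(failed))

Returning a list instead of the lazy generator from remove_objects forces the deletions to actually run inside clean and lets the caller inspect failures afterward.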

datajoint/schema.py

Lines changed: 3 additions & 1 deletion
@@ -228,7 +228,7 @@ def jobs(self):
         return self._jobs

     @property
-    def external_table(self):
+    def external(self):
         """
         schema.external provides a view of the external hash table for the schema
         :return: external table
@@ -237,6 +237,8 @@ def external_table(self):
             self._external = ExternalTable(self.connection, self.database)
         return self._external

+    external_table = external  # for backward compatibility to pre-0.12.0
+

 def create_virtual_module(module_name, schema_name, create_schema=False, create_tables=False, connection=None):
     """

tests/test_erd.py

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ def test_dependencies():

     @staticmethod
     def test_erd():
+        assert_true(dj.erd.erd_active, 'Failed to import networkx and pydot')
         erd = dj.ERD(schema, context=namespace)
         graph = erd._make_graph()
         assert_true(set(cls.__name__ for cls in (A, B, D, E, L)).issubset(graph.nodes()))
