Skip to content

Commit 66492d1

Browse files
felix5572njzjz
andauthored
add support for new dpdispatcher (#449)
* fix show error * use new dpdispatcher * update new dpdispatcher * update for new dpdispatcher * support dpdispatcher * fix bug when using new dpdispatcher * update for new dpdispatcher init_bulk * update examples for new dpdispatcher * fix install * fix typo use LooseVersion Co-authored-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu> * update LooseVersion * update dependency dpdispatcher min version * fix example * update min dpdispatcher version * update make_submission change trans_comm_data to forward_common_files * delete unused compatibility code; change mdata to mdata_machine in make_submission * update group_size for new dpdispatcher * fix bug change indents to spaces * update docs for new dpdispatcher * updata args check for key local_root; update examples/CH4-refact-dpdispatcher; clean dupilicated files * update dpdispatcher for auto_test * update warnings for old dpdispatcher * update docs; explain dpgen new dpdispatcher more clearly * fix bug; LooseVersion(api_version) == "1.0" is handled now * more clear new dpdispatcher introductions * update README.md better docs format * update links * update docs for new dpdispatcher * detailed docs about new dpdispatcher * update docs for new dpdispatcher * update docs about dpgen new dpdispatcher * update docs for new dpdispatcher * update soft link Co-authored-by: felix5572 <yfb222333@gmail.com> Co-authored-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
1 parent e89d7fc commit 66492d1

File tree

21 files changed

+861
-318
lines changed

21 files changed

+861
-318
lines changed

README.md

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,6 +1094,110 @@ Here `pick_data` is the data to simplify and currently only supports `MultiSyste
10941094

10951095

10961096
## Set up machine
1097+
### new dpdispatcher update note
1098+
dpdispatcher Update Note:
1099+
dpdispatcher has updated and the api of `machine.json` is changed.
1100+
dpgen will use new dpdispatcher if the key `api_version` in dpgen's `machine.json`'s value is equal or large than `1.0`.
1101+
1102+
And dpgen will use old dpdispatcher if the key `api_version` is not specified in `machine.json` or the `api_version` is smaller than `1.0`.
1103+
This gurantees that the old `machine.json`s still work.
1104+
1105+
And for now dpdispatcher is maintained on a seperate repo.
1106+
The repo link: https://github.com/deepmodeling/dpdispatcher
1107+
1108+
The api of new dpdispatcher is close to old one except for a few changes.
1109+
1110+
The new `machine.json` examples can be seen [here](https://docs.deepmodeling.org/projects/dpdispatcher/en/latest/getting-started.html)
1111+
1112+
And Here are the explanations of the keys in [machine](https://docs.deepmodeling.org/projects/dpdispatcher/en/latest/machine.html)
1113+
[resources](https://docs.deepmodeling.org/projects/dpdispatcher/en/latest/resources.html).
1114+
1115+
1116+
Here is a example `machine.json` for dpgen's new dpdispatcher.
1117+
Please check the [documents](https://deepmd.readthedocs.io/projects/dpdispatcher/en/latest/) for more information about new dpdispatcher.
1118+
1119+
an example of new dpgen's machine.json
1120+
```json
1121+
{
1122+
"api_version": "1.0",
1123+
"train": [
1124+
{
1125+
"command": "dp",
1126+
"machine": {
1127+
"batch_type": "PBS",
1128+
"context_type": "SSHContext",
1129+
"local_root": "./",
1130+
"remote_root": "/home/user1234/work_path_dpdispatcher_test",
1131+
"remote_profile": {
1132+
"hostname": "39.xxx.xx.xx",
1133+
"username": "user1234"
1134+
}
1135+
},
1136+
"resources": {
1137+
"number_node": 1,
1138+
"cpu_per_node": 4,
1139+
"gpu_per_node": 1,
1140+
"queue_name": "T4_4_15",
1141+
"group_size": 1,
1142+
"custom_flags":["#SBATCH --mem=32G"],
1143+
"strategy": {"if_cuda_multi_devices": true},
1144+
"para_deg": 3,
1145+
"source_list": ["/home/user1234/deepmd.1.2.4.env"]
1146+
}
1147+
}
1148+
],
1149+
"model_devi":[
1150+
{
1151+
"command": "lmp",
1152+
"machine":{
1153+
"batch_type": "PBS",
1154+
"context_type": "SSHContext",
1155+
"local_root": "./",
1156+
"remote_root": "/home/user1234/work_path_dpdispatcher_test",
1157+
"remote_profile": {
1158+
"hostname": "39.xxx.xx.xx",
1159+
"username": "user1234"
1160+
}
1161+
},
1162+
"resources": {
1163+
"number_node": 1,
1164+
"cpu_per_node": 4,
1165+
"gpu_per_node": 1,
1166+
"queue_name": "T4_4_15",
1167+
"group_size": 5,
1168+
"source_list": ["/home/user1234/deepmd.1.2.4.env"]
1169+
}
1170+
}
1171+
],
1172+
"fp":[
1173+
{
1174+
"command": "vasp_std",
1175+
"machine":{
1176+
"batch_type": "PBS",
1177+
"context_type": "SSHContext",
1178+
"local_root": "./",
1179+
"remote_root": "/home/user1234/work_path_dpdispatcher_test",
1180+
"remote_profile": {
1181+
"hostname": "39.xxx.xx.xx",
1182+
"username": "user1234"
1183+
}
1184+
},
1185+
"resources": {
1186+
"number_node": 1,
1187+
"cpu_per_node": 32,
1188+
"gpu_per_node": 0,
1189+
"queue_name": "G_32_128",
1190+
"group_size": 1,
1191+
"source_list": ["~/vasp.env"]
1192+
}
1193+
}
1194+
]
1195+
}
1196+
```
1197+
note1: the key "local_root" in dpgen's machine.json is always `./`
1198+
1199+
### old dpdispatcher
1200+
10971201
When switching into a new machine, you may modifying the `MACHINE`, according to the actual circumstance. Once you have finished, the `MACHINE` can be re-used for any DP-GEN tasks without any extra efforts.
10981202

10991203
An example for `MACHINE` is:

dpgen/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ def info():
4444
print('%10s %10s %s' % (modui, mm.__version__, mm.__path__[0]))
4545
except ImportError:
4646
print('%10s %10s Not Found' % (modui, ''))
47+
except AttributeError:
48+
print('%10s %10s unknown version or path' %(modui, ''))
4749
print()
4850

4951
# reference

dpgen/auto_test/common_equi.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import glob
22
import os
3-
3+
import warnings
44
from monty.serialization import dumpfn
55

66
import dpgen.auto_test.lib.crys as crys
@@ -10,6 +10,8 @@
1010
from dpgen.auto_test.mpdb import get_structure
1111
from dpgen.dispatcher.Dispatcher import make_dispatcher
1212
from dpgen.remote.decide_machine import decide_fp_machine, decide_model_devi_machine
13+
from distutils.version import LooseVersion
14+
from dpgen.dispatcher.Dispatcher import make_submission
1315

1416
lammps_task_type = ['deepmd', 'meam', 'eam_fs', 'eam_alloy']
1517

@@ -149,23 +151,44 @@ def run_equi(confs,
149151
if len(run_tasks) == 0:
150152
return
151153
else:
154+
# if LooseVersion()
152155
run_tasks = [os.path.basename(ii) for ii in all_task]
153156
machine, resources, command, group_size = util.get_machine_info(mdata, inter_type)
154157
print('%d tasks will be submited '%len(run_tasks))
155158
for ii in range(len(work_path_list)):
156159
work_path = work_path_list[ii]
157160
disp = make_dispatcher(machine, resources, work_path, [run_tasks[ii]], group_size)
158161
print("%s --> Runing... "%(work_path))
162+
163+
api_version = mdata.get('api_version', '0.9')
164+
if LooseVersion(api_version) < LooseVersion('1.0'):
165+
warnings.warn(f"the dpdispatcher will be updated to new version."
166+
f"And the interface may be changed. Please check the documents for more details")
159167
disp.run_jobs(resources,
160-
command,
161-
work_path,
162-
[run_tasks[ii]],
163-
group_size,
164-
forward_common_files,
165-
forward_files,
166-
backward_files,
167-
outlog='outlog',
168-
errlog='errlog')
168+
command,
169+
work_path,
170+
[run_tasks[ii]],
171+
group_size,
172+
forward_common_files,
173+
forward_files,
174+
backward_files,
175+
outlog='outlog',
176+
errlog='errlog')
177+
elif LooseVersion(api_version) >= LooseVersion('1.0'):
178+
submission = make_submission(
179+
mdata_machine=machine,
180+
mdata_resource=resources,
181+
commands=[command],
182+
work_path=work_path,
183+
run_tasks=run_tasks,
184+
group_size=group_size,
185+
forward_common_files=forward_common_files,
186+
forward_files=forward_files,
187+
backward_files=backward_files,
188+
outlog = 'outlog',
189+
errlog = 'errlog'
190+
)
191+
submission.run_submission()
169192

170193

171194
def post_equi(confs, inter_param):

dpgen/auto_test/common_prop.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
from distutils.version import LooseVersion
12
import glob
23
import os
4+
import warnings
35
from multiprocessing import Pool
46
import dpgen.auto_test.lib.util as util
57
from dpgen import dlog
@@ -12,6 +14,7 @@
1214
from dpgen.auto_test.calculator import make_calculator
1315
from dpgen.dispatcher.Dispatcher import make_dispatcher
1416
from dpgen.remote.decide_machine import decide_fp_machine, decide_model_devi_machine
17+
from dpgen.dispatcher.Dispatcher import make_submission
1518

1619
lammps_task_type = ['deepmd', 'meam', 'eam_fs', 'eam_alloy']
1720

@@ -196,7 +199,11 @@ def worker(work_path,
196199
run_tasks = [os.path.basename(ii) for ii in all_task]
197200
machine, resources, command, group_size = util.get_machine_info(mdata, inter_type)
198201
disp = make_dispatcher(machine, resources, work_path, run_tasks, group_size)
199-
disp.run_jobs(resources,
202+
api_version = mdata.get('api_version', '0.9')
203+
if LooseVersion(api_version) < LooseVersion('1.0'):
204+
warnings.warn(f"the dpdispatcher will be updated to new version."
205+
f"And the interface may be changed. Please check the documents for more details")
206+
disp.run_jobs(resources,
200207
command,
201208
work_path,
202209
run_tasks,
@@ -206,7 +213,21 @@ def worker(work_path,
206213
backward_files,
207214
outlog='outlog',
208215
errlog='errlog')
209-
216+
elif LooseVersion(api_version) >= LooseVersion('1.0'):
217+
submission = make_submission(
218+
mdata_machine=machine,
219+
mdata_resource=resources,
220+
commands=[command],
221+
work_path=work_path,
222+
run_tasks=run_tasks,
223+
group_size=group_size,
224+
forward_common_files=forward_common_files,
225+
forward_files=forward_files,
226+
backward_files=backward_files,
227+
outlog = 'outlog',
228+
errlog = 'errlog'
229+
)
230+
submission.run_submission()
210231

211232
def post_property(confs,
212233
# inter_param,

dpgen/data/gen.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@
2020
import dpgen.data.tools.bcc as bcc
2121
import dpgen.data.tools.diamond as diamond
2222
import dpgen.data.tools.sc as sc
23+
from distutils.version import LooseVersion
2324
from dpgen.generator.lib.vasp import incar_upper
2425
from pymatgen.core import Structure
2526
from pymatgen.io.vasp import Incar
2627
from dpgen.remote.decide_machine import decide_fp_machine
2728
from dpgen import ROOT_PATH
28-
from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher
29+
from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher, make_submission
2930

3031

3132

@@ -581,9 +582,13 @@ def run_vasp_relax(jdata, mdata):
581582
# if not _vasp_check_fin(ii):
582583
# relax_run_tasks.append(ii)
583584
run_tasks = [os.path.basename(ii) for ii in relax_run_tasks]
584-
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
585-
#dlog.info(run_tasks)
586-
dispatcher.run_jobs(fp_resources,
585+
586+
api_version = mdata.get('api_version', '0.9')
587+
if LooseVersion(api_version) < LooseVersion('1.0'):
588+
warnings.warn(f"the dpdispatcher will be updated to new version."
589+
f"And the interface may be changed. Please check the documents for more details")
590+
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
591+
dispatcher.run_jobs(fp_resources,
587592
[fp_command],
588593
work_dir,
589594
run_tasks,
@@ -592,6 +597,22 @@ def run_vasp_relax(jdata, mdata):
592597
forward_files,
593598
backward_files)
594599

600+
elif LooseVersion(api_version) >= LooseVersion('1.0'):
601+
submission = make_submission(
602+
mdata['fp_machine'],
603+
mdata['fp_resources'],
604+
commands=[fp_command],
605+
work_path=work_dir,
606+
run_tasks=run_tasks,
607+
group_size=fp_group_size,
608+
forward_common_files=forward_common_files,
609+
forward_files=forward_files,
610+
backward_files=backward_files,
611+
outlog = 'fp.log',
612+
errlog = 'fp.log')
613+
submission.run_submission()
614+
615+
595616
def run_vasp_md(jdata, mdata):
596617
fp_command = mdata['fp_command']
597618
fp_group_size = mdata['fp_group_size']
@@ -627,8 +648,12 @@ def run_vasp_md(jdata, mdata):
627648
run_tasks = [ii.replace(work_dir+"/", "") for ii in md_run_tasks]
628649
#dlog.info("md_work_dir", work_dir)
629650
#dlog.info("run_tasks",run_tasks)
630-
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
631-
dispatcher.run_jobs(fp_resources,
651+
api_version = mdata.get('api_version', '0.9')
652+
if LooseVersion(api_version) < LooseVersion('1.0'):
653+
warnings.warn(f"the dpdispatcher will be updated to new version."
654+
f"And the interface may be changed. Please check the documents for more details")
655+
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
656+
dispatcher.run_jobs(fp_resources,
632657
[fp_command],
633658
work_dir,
634659
run_tasks,
@@ -637,6 +662,21 @@ def run_vasp_md(jdata, mdata):
637662
forward_files,
638663
backward_files)
639664

665+
elif LooseVersion(api_version) >= LooseVersion('1.0'):
666+
submission = make_submission(
667+
mdata['fp_machine'],
668+
mdata['fp_resources'],
669+
commands=[fp_command],
670+
work_path=work_dir,
671+
run_tasks=run_tasks,
672+
group_size=fp_group_size,
673+
forward_common_files=forward_common_files,
674+
forward_files=forward_files,
675+
backward_files=backward_files,
676+
outlog = 'fp.log',
677+
errlog = 'fp.log')
678+
submission.run_submission()
679+
640680
def gen_init_bulk(args) :
641681
try:
642682
import ruamel

dpgen/dispatcher/Dispatcher.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
from distutils.version import LooseVersion
12
import os,sys,time,random,json,glob
3+
from dpdispatcher import Task, Submission, Resources, Machine
24
from dpgen.dispatcher.LocalContext import LocalSession
35
from dpgen.dispatcher.LocalContext import LocalContext
46
from dpgen.dispatcher.LazyLocalContext import LazyLocalContext
@@ -12,6 +14,8 @@
1214
from dpgen.dispatcher.JobStatus import JobStatus
1315
from dpgen import dlog
1416
from hashlib import sha1
17+
# import dargs
18+
from dargs.dargs import Argument
1519

1620
def _split_tasks(tasks,
1721
group_size):
@@ -338,3 +342,40 @@ def make_dispatcher(mdata, mdata_resource=None, work_path=None, run_tasks=None,
338342
context_type = 'lazy-local'
339343
disp = Dispatcher(mdata, context_type=context_type, batch_type=batch_type)
340344
return disp
345+
346+
def make_submission(mdata_machine, mdata_resources, commands, work_path, run_tasks, group_size,
347+
forward_common_files, forward_files, backward_files, outlog, errlog):
348+
349+
machine = Machine.load_from_dict(mdata_machine)
350+
resources = Resources.load_from_dict(mdata_resources)
351+
352+
if resources['local_root'] != './':
353+
raise RuntimeError(f"local_root must be './' in dpgen's machine.json.")
354+
355+
command = "&&".join(commands)
356+
357+
task_list = []
358+
for ii in run_tasks:
359+
task = Task(
360+
command=command,
361+
task_work_path=ii,
362+
forward_files=forward_files,
363+
backward_files=backward_files,
364+
outlog=outlog,
365+
errlog=errlog
366+
)
367+
task_list.append(task)
368+
369+
submission = Submission(
370+
work_base=work_path,
371+
machine=machine,
372+
resources=resources,
373+
task_list=task_list,
374+
forward_common_files=forward_common_files,
375+
backward_common_files=[]
376+
)
377+
return submission
378+
379+
380+
381+

0 commit comments

Comments
 (0)