DRL学習プログラムのVM再起動対応強化の準備中
前回記事で、VM再起動のDRL学習プログラムへの影響を書きましたが、この解決に向けて、全ThreadのPseudo-count情報をsave/loadする機能を作成中です。
そのためには、Thread間での同期処理が必要であり、これを誤ると後で発見するのが大変なので、まずはテストプログラムを作成して試してみました。結果、上手く動いているようですので、DRL学習プログラムに作り込む予定です。
ちなみに、下記がテストプログラムです。
"""Test program for a save-point rendezvous between worker threads.

Several worker threads advance a shared step counter (``global_t``).  Whenever
the counter passes ``next_save_steps`` (or the run is ending), every thread
stops at a rendezvous so that thread 0 can perform a "save" while the others
are parked:

1. thread 0 opens the round and sets ``th0_ready``;
2. every other thread checks in; the last one to arrive sets ``all_ready``;
3. thread 0 does the (simulated) save and sets ``th0_finish``;
4. all threads resume, or exit if thread 0 marked the round as the last one.
"""
import random
import signal
import threading
import time

# Number of worker threads taking part in every rendezvous.  Must be >= 2:
# thread 0 waits for at least one other thread to set ``all_ready``.
num_threads = 8

th0_ready = threading.Event()   # Thread 0 has opened a save round
all_ready = threading.Event()   # All threads have checked in to the round
th0_finish = threading.Event()  # Thread 0 finished the save
# A freshly created Event is already cleared, so no explicit clear() is needed.

# Guards the read-modify-write updates of ``num_ready`` and ``global_t``.
state_lock = threading.Lock()

num_ready = 0            # Number of threads checked in to the current round
global_t = 0             # Shared step counter advanced by every thread
save_time_interval = 16  # Distance (in global_t units) between save rounds
max_global_t = 640       # The run ends once global_t exceeds this value
next_save_steps = save_time_interval
stop_requested = False   # Set by the SIGINT handler
exit_after_round = False  # Thread 0's per-round verdict: is this the last round?
r = [0] * num_threads    # Per-thread count of rounds taken part in


def _trace(indent, tag):
    """Print one trace tag in the column belonging to the calling thread."""
    # NOTE(review): the tags look like they were meant to share one width so
    # that the columns line up; whitespace may have been lost - confirm.
    print("{}{}".format(indent, tag))


def train_function(i):
    """Run the worker loop of thread ``i`` until the final save round is over.

    Thread 0 coordinates each save round; all other threads check in and wait
    for it.  Between rounds every thread sleeps for about one second and adds
    the slept time to ``global_t``.

    Args:
        i: Index of this worker (0 .. num_threads - 1).  Index 0 is the
            coordinator that performs the save.
    """
    global global_t
    global num_ready
    global next_save_steps
    global exit_after_round

    indent = " |" * i
    while True:
        if (global_t > next_save_steps
                or global_t > max_global_t
                or stop_requested):
            _trace(indent, " START ")
            if i == 0:
                r[i] += 1
                # Reset the round state *before* releasing the other threads.
                all_ready.clear()
                th0_finish.clear()
                with state_lock:
                    num_ready = 1
                _trace(indent, "th0_r.S")
                th0_ready.set()
                _trace(indent, "all_r.W")
                all_ready.wait()
                _trace(indent, "all_rdy")
                next_save_steps += save_time_interval
                # Everyone has passed th0_ready.wait(); re-arm it for the
                # next round while the others are parked on th0_finish.
                th0_ready.clear()
                _trace(indent, "sleep ")
                time.sleep(random.random() * 0.5)  # stands in for the save
                print("global_t={}, r={}".format(global_t, r))
                # Decide once, for all threads, whether this round is the
                # last one.  If every thread evaluated stop_requested itself,
                # a Ctrl+C arriving while they wake up could make some exit
                # and others continue, and the next round would never reach
                # num_threads check-ins.
                exit_after_round = global_t > max_global_t or stop_requested
                _trace(indent, "th0_f.S")
                th0_finish.set()
            else:
                _trace(indent, "th0_r.W")
                th0_ready.wait()
                _trace(indent, "th0_rdy")
                r[i] += 1
                # The increment and the "am I last?" test must be atomic,
                # otherwise a lost update leaves all_ready unset forever.
                with state_lock:
                    num_ready += 1
                    everyone_in = num_ready == num_threads
                if everyone_in:
                    _trace(indent, "all_r.S")
                    all_ready.set()
                _trace(indent, "th0_f.W")
                th0_finish.wait()
            _trace(indent, " END ")
            # Written by thread 0 before th0_finish.set(), so every thread
            # leaving the round observes the same value.
            if exit_after_round:
                break
        t = random.random() * 0.01 + 1
        time.sleep(t)
        with state_lock:
            global_t += t


def signal_handler(signal, frame):
    """SIGINT handler: ask the workers to finish after one more save round.

    Args:
        signal: Signal number passed by the interpreter (unused).
        frame: Interrupted stack frame passed by the interpreter (unused).
    """
    global stop_requested
    print('You pressed Ctrl+C!')
    stop_requested = True


def main():
    """Start all worker threads and wait for them to finish."""
    train_threads = [
        threading.Thread(target=train_function, args=(i,))
        for i in range(num_threads)
    ]
    signal.signal(signal.SIGINT, signal_handler)
    for t in train_threads:
        t.start()
    print('Press Ctrl+C to stop')
    for t in train_threads:
        t.join()


if __name__ == "__main__":
    main()
上の実行結果は下記のようになり、上手く動いているようです。
Press Ctrl+C to stop | | | | | | START | | | | | |th0_r.W START th0_r.S all_r.W | | | | | |th0_rdy | | | | | |th0_f.W | | | START | | |th0_r.W | | |th0_rdy | | |th0_f.W | | | | | START | | | | |th0_r.W | | | | |th0_rdy | | | | |th0_f.W | START |th0_r.W |th0_rdy |th0_f.W | | | | | | | START | | | | | | |th0_r.W | | | | | | |th0_rdy | | | | | | |th0_f.W | | START | |th0_r.W | |th0_rdy | |th0_f.W | | | | START | | | |th0_r.W | | | |th0_rdy | | | |all_r.S | | | |th0_f.W all_rdy sleep global_t=23.092924113512346, r=[1, 1, 1, 1, 1, 1, 1, 1] th0_f.S END | | | END | END | | | | END | | | | | | | END | | | | | END | | | | | | END | | END | START |th0_r.W | | | | | | | START | | | | | | |th0_r.W | | | | | START | | | | |th0_r.W | | | | START | | | |th0_r.W START th0_r.S all_r.W |th0_rdy | | | |th0_rdy | | | | | | |th0_rdy |th0_f.W | | | | |th0_rdy | | | | |th0_f.W | | | | | | |th0_f.W | | | |th0_f.W | | | START | | |th0_r.W | | |th0_rdy | | |th0_f.W | | | | | | START | | | | | |th0_r.W | | | | | |th0_rdy | | | | | |th0_f.W | | START | |th0_r.W | |th0_rdy | |all_r.S | |th0_f.W all_rdy sleep global_t=39.164206668006784, r=[2, 2, 2, 2, 2, 2, 2, 2] th0_f.S END | END | | | | | | | END | | | | END | | | | | | END | | | END | | | | | END | | END ^CYou pressed Ctrl+C! | | START | |th0_r.W | | | | | | | START | | | | | | |th0_r.W | | | | | | START | | | | | |th0_r.W | | | | | START | | | | |th0_r.W | | | | START | | | |th0_r.W START th0_r.S all_r.W | | | | | | |th0_rdy | | | | | |th0_rdy | | | |th0_rdy | | | | | | |th0_f.W | | | | | |th0_f.W | |th0_rdy | | | | |th0_rdy | | | |th0_f.W | |th0_f.W | | | | |th0_f.W | | | START | | |th0_r.W | | |th0_rdy | | |th0_f.W | START |th0_r.W |th0_rdy |all_r.S |th0_f.W all_rdy sleep global_t=55.24387642472125, r=[3, 3, 3, 3, 3, 3, 3, 3] th0_f.S END | | | | END | | END | | | | | END | | | | | | | END | | | END | END | | | | | | END