Commit 517795c8 authored by Fardale's avatar Fardale

change to dynamic resource handling

the server query each node to find a node with enough
resources
parent c35bccdc
...@@ -5,87 +5,140 @@ module SHashtbl = CCHashtbl.Make (CCString) ...@@ -5,87 +5,140 @@ module SHashtbl = CCHashtbl.Make (CCString)
(* TODO: store currently running jobs and check from time (* TODO: store currently running jobs and check from time
* to time that node are still alive *) * to time that node are still alive *)
let nodes : (node * computation list) SHashtbl.t = SHashtbl.create 10 let nodes : (int * computation list) SHashtbl.t = SHashtbl.create 10
let jobs : job CCDeque.t = CCDeque.create () let jobs : job CCDeque.t = CCDeque.create ()
let jobs_id = ref 0 let jobs_id = ref 0
let string_of_ret_code = function let string_of_ret_code = function
| `WEXITED i -> | `WEXITED i -> Printf.sprintf "WEXITED %i" i
Printf.sprintf "WEXITED %i" i | `WSIGNALED s -> Printf.sprintf "WSIGNALED %i" s
| `WSIGNALED s -> | `WSTOPPED s -> Printf.sprintf "WSTOPPED %i" s
Printf.sprintf "WSIGNALED %i" s
| `WSTOPPED s ->
Printf.sprintf "WSTOPPED %i" s
let string_of_sockaddr = function let string_of_sockaddr = function
| Unix.ADDR_UNIX s -> | Unix.ADDR_UNIX s -> s
s | Unix.ADDR_INET (ip, _) -> Unix.string_of_inet_addr ip
| Unix.ADDR_INET (ip, _) ->
Unix.string_of_inet_addr ip
let end_job id sockaddr = let end_job id sockaddr =
let n, l = SHashtbl.find nodes (string_of_sockaddr sockaddr) in let port, l = SHashtbl.find nodes (string_of_sockaddr sockaddr) in
let j = List.find (fun (c : computation) -> c.id = id) l in let j = List.find (fun (c : computation) -> c.id = id) l in
SHashtbl.replace nodes SHashtbl.replace nodes
(string_of_sockaddr sockaddr) (string_of_sockaddr sockaddr)
( {n with cpu= n.cpu + j.cpu; ram= n.ram + j.ram} ( port,
, CCList.remove_one ~eq:(fun (c1 : computation) c2 -> c1.id = c2.id) j l ) ; CCList.remove_one ~eq:(fun (c1 : computation) c2 -> c1.id = c2.id) j l );
Lwt.return_unit Lwt.return_unit
let rec launch_job port pass () = let get_stats pass sockaddr =
if not (CCDeque.is_empty jobs) then ( try%lwt
Lwt_io.with_connection sockaddr (fun (ic, oc) ->
Lwt_io.write_line oc (Serialization_j.string_of_query (pass, `STAT))
>>= fun () ->
Lwt_io.read_line ic >>= fun json ->
match
CCResult.guard (fun () -> Serialization_j.stat_of_string json)
with
| Ok (cpu, ram) ->
Logs_lwt.debug (fun m -> m "Stat: cpu: %i ram: %i" cpu ram)
>>= fun () -> Lwt.return (cpu, ram)
| Error e ->
Logs_lwt.err (fun m ->
m "Error during the reception of the stats: %s"
(Printexc.to_string e))
>|= fun () -> (0, 0))
with e ->
Logs_lwt.err (fun m ->
m "Error during the reception of the stats: %s" (Printexc.to_string e))
>|= fun () -> (0, 0)
let send_computation sockaddr pass computation =
Lwt_io.with_connection sockaddr (fun (ic, oc) ->
Lwt_io.write_line oc
(Serialization_j.string_of_query (pass, `COMPUTATION computation))
>>= fun () ->
Lwt_io.flush oc >>= fun () ->
try%lwt
Lwt_io.read_line ic >>= fun json ->
match
CCResult.guard (fun () -> Serialization_j.answer_of_string json)
with
| Ok answer -> (
match answer with
| `Ok ->
Logs_lwt.debug (fun m ->
m "Computation %i,%i successfully sent."
(fst computation.id) (snd computation.id))
>|= fun () -> true
| `Error s ->
Logs_lwt.err (fun m ->
m "Error during the reception of the result %i,%i: %s"
(fst computation.id) (snd computation.id) s)
>|= fun () -> false )
| Error e ->
Logs_lwt.err (fun m ->
m "Error during the reception of the answer: %s"
(Printexc.to_string e))
>|= fun () -> false
with End_of_file ->
Logs_lwt.err (fun m -> m "Error during the read of the answer: EOF")
>|= fun () -> false)
let mutex = ref true
let rec launch_job server_port pass () =
if !mutex && not (CCDeque.is_empty jobs) then (
mutex := false;
let job = CCDeque.peek_front jobs in let job = CCDeque.peek_front jobs in
match SHashtbl.fold
SHashtbl.fold (fun addr (port, l) node ->
(fun k ((n : node), l) node -> node >>= fun node ->
match node with match node with
| Some x -> | Some x -> Lwt.return (Some x)
Some x | None ->
| None -> let sockaddr =
if n.cpu >= job.cpu && n.ram >= job.ram then Some (k, n, l) Unix.ADDR_INET (Unix.inet_addr_of_string addr, port)
else None) in
nodes None get_stats pass sockaddr >|= fun (cpu, ram) ->
with if cpu >= job.cpu && ram >= job.ram then Some (addr, port, l)
else None)
nodes (Lwt.return None)
>>= fun node ->
match node with
| None -> | None ->
mutex := true;
Logs_lwt.debug (fun m -> m "No free node") Logs_lwt.debug (fun m -> m "No free node")
| Some (k, n, l) -> | Some (addr, port, l) ->
let sockaddr = let sockaddr = Unix.ADDR_INET (Unix.inet_addr_of_string addr, port) in
Unix.ADDR_INET (Unix.inet_addr_of_string n.addr, n.port)
in
let computation = let computation =
{ id= (job.id, job.current) {
; env= id = (job.id, job.current);
[ Printf.sprintf "OCLUSTER_ARRAY_TASK_ID=%i" job.current env =
; Printf.sprintf "OCLUSTER_TASK_ID=%i" job.id ] [
; script= job.script Printf.sprintf "OCLUSTER_ARRAY_TASK_ID=%i" job.current;
; args= job.args Printf.sprintf "OCLUSTER_TASK_ID=%i" job.id;
; time= job.time ];
; port script = job.script;
; cpu= job.cpu args = job.args;
; ram= job.ram } time = job.time;
port = server_port;
cpu = job.cpu;
ram = job.ram;
}
in in
SHashtbl.replace nodes k if%lwt send_computation sockaddr pass computation then (
( {n with cpu= n.cpu - computation.cpu; ram= n.ram - computation.ram} ( SHashtbl.replace nodes addr (port, computation :: l);
, computation :: l ) ; if job.current + 1 < job.iteration then
if job.current + 1 < job.iteration then job.current <- job.current + 1 job.current <- job.current + 1
else ignore (CCDeque.take_front jobs) ; else CCDeque.remove_front jobs;
Logs_lwt.debug (fun m -> Lwt.return_unit )
m "Send computation %i,%i to %s on %i" (fst computation.id) >>= fun () ->
(snd computation.id) n.addr n.port) Lwt_unix.sleep 2. >>= fun () ->
>>= fun () -> mutex := true;
Lwt_io.with_connection sockaddr (fun (_ic, oc) -> launch_job server_port pass () ) )
Lwt_io.write_line oc else Lwt.return_unit
(Serialization_j.string_of_query (pass, `COMPUTATION computation)))
(* TODO: check return value *)
>>= Lwt.pause
>>= launch_job port pass )
else Logs_lwt.debug (fun m -> m "No computation")
let server_handler pass port sockaddr (ic, oc) = let server_handler pass port sockaddr (ic, oc) =
Lwt_io.read_line ic Lwt_io.read_line ic >>= fun json ->
>>= fun json ->
match CCResult.guard (fun () -> Serialization_j.query_of_string json) with match CCResult.guard (fun () -> Serialization_j.query_of_string json) with
| Result.Ok (query_pass, query) -> | Result.Ok (query_pass, query) ->
if query_pass = pass then if query_pass = pass then
...@@ -96,33 +149,39 @@ let server_handler pass port sockaddr (ic, oc) = ...@@ -96,33 +149,39 @@ let server_handler pass port sockaddr (ic, oc) =
<&> Logs_lwt.debug (fun m -> <&> Logs_lwt.debug (fun m ->
m "Receive result: %s" m "Receive result: %s"
(Serialization_j.string_of_result (Serialization_j.string_of_result
{result with stdout= "<stdout>"; stderr= "<stderr>"})) {
result with
stdout = "<stdout>";
stderr = "<stderr>";
}))
>|= fun () -> >|= fun () ->
(Lwt.async (fun () -> Lwt.async (fun () ->
Lwt.join Lwt.join
[ ( if String.length result.stdout > 0 then [
Lwt_io.with_file ~mode:Lwt_io.output ( if String.length result.stdout > 0 then
(Printf.sprintf "ocluster_%i_%i.out" (fst result.id) Lwt_io.with_file ~mode:Lwt_io.output
(snd result.id)) (Printf.sprintf "ocluster_%i_%i.out" (fst result.id)
(fun oc -> Lwt_io.write oc result.stdout) (snd result.id))
else Lwt.return_unit ) (fun oc -> Lwt_io.write oc result.stdout)
; ( if String.length result.stderr > 0 then else Lwt.return_unit );
Lwt_io.with_file ~mode:Lwt_io.output ( if String.length result.stderr > 0 then
(Printf.sprintf "ocluster_%i_%i.err" (fst result.id) Lwt_io.with_file ~mode:Lwt_io.output
(snd result.id)) (Printf.sprintf "ocluster_%i_%i.err" (fst result.id)
(fun oc -> Lwt_io.write oc result.stderr) (snd result.id))
else Lwt.return_unit ) (fun oc -> Lwt_io.write oc result.stderr)
; Lwt_io.with_file ~mode:Lwt_io.output else Lwt.return_unit );
(Printf.sprintf "ocluster_%i_%i.log" (fst result.id) Lwt_io.with_file ~mode:Lwt_io.output
(snd result.id)) (Printf.sprintf "ocluster_%i_%i.log" (fst result.id)
(fun oc -> (snd result.id))
Lwt_io.write oc (fun oc ->
(Printf.sprintf Lwt_io.write oc
"Job completed at %f\nReturn code: %s\n" (Printf.sprintf
(Unix.time ()) "Job completed at %f\nReturn code: %s\n"
(string_of_ret_code result.ret_code))) (Unix.time ())
; end_job result.id sockaddr >>= Lwt.pause (string_of_ret_code result.ret_code)));
>>= launch_job port pass ])) end_job result.id sockaddr >>= Lwt.pause
>>= launch_job port pass;
])
| `JOB submission -> | `JOB submission ->
Lwt_io.write_line oc (Serialization_j.string_of_answer `Ok) Lwt_io.write_line oc (Serialization_j.string_of_answer `Ok)
>>= (fun () -> Lwt_io.flush oc) >>= (fun () -> Lwt_io.flush oc)
...@@ -132,19 +191,21 @@ let server_handler pass port sockaddr (ic, oc) = ...@@ -132,19 +191,21 @@ let server_handler pass port sockaddr (ic, oc) =
>>= fun () -> >>= fun () ->
let empty = CCDeque.is_empty jobs in let empty = CCDeque.is_empty jobs in
let job = let job =
{ id= !jobs_id {
; name = submission.name id = !jobs_id;
; current= 0 name = submission.name;
; iteration= submission.iteration current = 0;
; script= submission.script iteration = submission.iteration;
; args= submission.args script = submission.script;
; time= submission.time args = submission.args;
; port time = submission.time;
; cpu= submission.cpu port;
; ram= submission.ram } cpu = submission.cpu;
ram = submission.ram;
}
in in
CCDeque.push_back jobs job ; CCDeque.push_back jobs job;
incr jobs_id ; incr jobs_id;
if empty then launch_job port pass () else Lwt.return_unit if empty then launch_job port pass () else Lwt.return_unit
| `JOBQ (id, delete) as jobq -> | `JOBQ (id, delete) as jobq ->
Lwt_io.write_line oc (Serialization_j.string_of_answer `Ok) Lwt_io.write_line oc (Serialization_j.string_of_answer `Ok)
...@@ -158,20 +219,19 @@ let server_handler pass port sockaddr (ic, oc) = ...@@ -158,20 +219,19 @@ let server_handler pass port sockaddr (ic, oc) =
try (CCDeque.peek_back jobs).id with CCDeque.Empty -> 0 try (CCDeque.peek_back jobs).id with CCDeque.Empty -> 0
in in
let id = CCOpt.get_or ~default:last_id id in let id = CCOpt.get_or ~default:last_id id in
CCDeque.filter_in_place jobs (fun j -> j.id != id) ; CCDeque.filter_in_place jobs (fun j -> j.id != id);
Logs_lwt.info (fun m -> m "Removing jobs with id %i" id) ) Logs_lwt.info (fun m -> m "Removing jobs with id %i" id) )
else else
(let jobs_list = let jobs_list =
match id with match id with
| None -> | None -> CCDeque.to_list jobs
CCDeque.to_list jobs | Some id ->
| Some id -> CCDeque.(
CCDeque.( filter (fun (j : job) -> j.id = id) jobs |> to_list)
filter (fun (j : job) -> j.id = id) jobs |> to_list) in
in Lwt_io.write_line oc
Lwt_io.write_line oc (Serialization_j.string_of_query_data (`JOBS jobs_list))
(Serialization_j.string_of_query_data (`JOBS jobs_list)) >>= fun () -> Lwt_io.flush oc
>>= fun () -> Lwt_io.flush oc)
| _ -> | _ ->
Lwt_io.write_line oc Lwt_io.write_line oc
(Serialization_j.string_of_answer (`Error "Unwanted command")) (Serialization_j.string_of_answer (`Error "Unwanted command"))
...@@ -186,31 +246,32 @@ let server_handler pass port sockaddr (ic, oc) = ...@@ -186,31 +246,32 @@ let server_handler pass port sockaddr (ic, oc) =
Lwt_io.write_line oc Lwt_io.write_line oc
(Serialization_j.string_of_answer (Serialization_j.string_of_answer
(`Error (`Error
(Printf.sprintf (Printf.sprintf "Error during the reception of the computation: %s"
"Error during the reception of the computation: %s"
(Printexc.to_string e)))) (Printexc.to_string e))))
>>= (fun () -> Lwt_io.flush oc) >>= (fun () -> Lwt_io.flush oc)
<&> Logs_lwt.err (fun m -> <&> Logs_lwt.err (fun m ->
m "Error during the reception of the computation: %s" m "Error during the reception of the computation: %s"
(Printexc.to_string e)) (Printexc.to_string e))
let rec timer f = Lwt_unix.sleep 1800. >>= f >>= fun () -> timer f
let stop_server resolver server _ = Lwt.wakeup_later resolver server let stop_server resolver server _ = Lwt.wakeup_later resolver server
let cmd config () = let cmd config () =
let conf = let conf =
Serialization_j.master_conf_of_string (CCIO.with_in config CCIO.read_all) Serialization_j.master_conf_of_string (CCIO.with_in config CCIO.read_all)
in in
List.iter (fun n -> SHashtbl.add nodes n.addr (n, [])) conf.nodes ; List.iter (fun n -> SHashtbl.add nodes n.addr (n.port, [])) conf.nodes;
let promise, resolver = Lwt.task () in let promise, resolver = Lwt.task () in
Logs_lwt.info (fun m -> m "master at %i with pass %s" conf.port conf.pass) Logs_lwt.info (fun m -> m "master at %i with pass %s" conf.port conf.pass)
>>= fun () -> >>= fun () ->
Lwt_io.establish_server_with_client_address ( Lwt.async (fun () -> timer (launch_job conf.port conf.pass));
(Unix.ADDR_INET (Unix.inet_addr_any, conf.port)) Lwt_io.establish_server_with_client_address
(server_handler conf.pass conf.port) (Unix.ADDR_INET (Unix.inet_addr_any, conf.port))
>>= fun server -> (server_handler conf.pass conf.port) )
let _ = Lwt_unix.on_signal 15 (stop_server resolver server)
and _ = Lwt_unix.on_signal 2 (stop_server resolver server) in
promise
>>= fun server -> >>= fun server ->
let _ = Lwt_unix.on_signal 15 (stop_server resolver ())
and _ = Lwt_unix.on_signal 2 (stop_server resolver ()) in
promise >>= fun () ->
Lwt_io.shutdown_server server Lwt_io.shutdown_server server
<&> Logs_lwt.info (fun m -> m "Shuting down server") <&> Logs_lwt.info (fun m -> m "Shuting down server")
...@@ -3,22 +3,53 @@ open Serialization_t ...@@ -3,22 +3,53 @@ open Serialization_t
let max_std = 5120000 (* 5Mo *) let max_std = 5120000 (* 5Mo *)
let get_ram () =
CCIO.(with_in "/proc/meminfo" read_lines_l)
|> CCList.find_map (fun l ->
if CCString.prefix ~pre:"MemAvailable" l then
match String.split_on_char ':' l with
| [ _; ram ] ->
Option.bind
(String.split_on_char ' ' (String.trim ram) |> CCList.head_opt)
CCInt.of_string
| _ -> None
else None)
let get_load () =
Option.bind
CCIO.(with_in "/proc/loadavg" read_line)
(fun s ->
match String.split_on_char ' ' s with
| v :: _ -> Some (CCFloat.of_string_exn v)
| _ -> None)
let get_cpu_count () =
CCIO.(with_in "/proc/cpuinfo" read_lines_l)
|> CCList.find_map (fun l ->
if CCString.prefix ~pre:"cpu cores" l then
match String.split_on_char ':' l with
| [ _; ncpu ] -> String.trim ncpu |> CCInt.of_string
| _ -> None
else None)
let get_available_cpu () =
match (get_cpu_count (), get_load ()) with
| Some cpu_count, Some load -> cpu_count - CCFloat.(ceil (load +. 1.5) |> to_int)
| _ -> 0
let get_available_ram () =
match get_ram () with Some ram -> (ram / 1024) - 4096 | None -> 0
let process_status_to_ret_code = function let process_status_to_ret_code = function
| Unix.WEXITED c -> | Unix.WEXITED c -> `WEXITED c
`WEXITED c | Unix.WSIGNALED s -> `WSIGNALED s
| Unix.WSIGNALED s -> | Unix.WSTOPPED s -> `WSTOPPED s
`WSIGNALED s
| Unix.WSTOPPED s ->
`WSTOPPED s
let run_computation (computation : computation) = let run_computation (computation : computation) =
Lwt_io.with_temp_file ~prefix:"ocluster" ~perm:0o700 (fun (name, oc) -> Lwt_io.with_temp_file ~prefix:"ocluster" ~perm:0o700 (fun (name, oc) ->
Lwt_io.write_line oc computation.script Lwt_io.write_line oc computation.script >>= fun () ->
>>= fun () -> Lwt_io.close oc >>= fun () ->
Lwt_io.close oc Lwt_unix.sleep 2. >>= fun () ->
>>= fun () ->
Lwt_unix.sleep 2.
>>= fun () ->
let read_stderr, write_stderr = Lwt_unix.pipe_in () let read_stderr, write_stderr = Lwt_unix.pipe_in ()
and read_stdout, write_stdout = Lwt_unix.pipe_in () in and read_stdout, write_stdout = Lwt_unix.pipe_in () in
let%lwt ret_code = let%lwt ret_code =
...@@ -28,20 +59,20 @@ let run_computation (computation : computation) = ...@@ -28,20 +59,20 @@ let run_computation (computation : computation) =
(name, Array.of_list (name :: computation.args)) (name, Array.of_list (name :: computation.args))
and stdout = Lwt_io.read (Lwt_io.of_fd ~mode:Lwt_io.input read_stdout) and stdout = Lwt_io.read (Lwt_io.of_fd ~mode:Lwt_io.input read_stdout)
and stderr = Lwt_io.read (Lwt_io.of_fd ~mode:Lwt_io.input read_stderr) in and stderr = Lwt_io.read (Lwt_io.of_fd ~mode:Lwt_io.input read_stderr) in
Lwt_unix.close read_stderr Lwt_unix.close read_stderr >>= fun () ->
>>= fun () -> Lwt_unix.close read_stdout >|= fun () ->
Lwt_unix.close read_stdout {
>|= fun () -> id = computation.id;
{ id= computation.id stdout =
; stdout= ( if String.length stdout > max_std then
( if String.length stdout > max_std then CCString.drop (String.length stdout - max_std) stdout
CCString.drop (String.length stdout - max_std) stdout else stdout );
else stdout ) stderr =
; stderr= ( if String.length stderr > max_std then
( if String.length stderr > max_std then CCString.drop (String.length stderr - max_std) stderr
CCString.drop (String.length stderr - max_std) stderr else stderr );
else stderr ) ret_code = process_status_to_ret_code ret_code;
; ret_code= process_status_to_ret_code ret_code }) })
let rec send_result sockaddr pass result = let rec send_result sockaddr pass result =
if%lwt if%lwt
...@@ -49,26 +80,24 @@ let rec send_result sockaddr pass result = ...@@ -49,26 +80,24 @@ let rec send_result sockaddr pass result =
Lwt_io.write_line oc Lwt_io.write_line oc
(Serialization_j.string_of_query (pass, `RESULT result)) (Serialization_j.string_of_query (pass, `RESULT result))
>>= fun () -> >>= fun () ->
Lwt_io.flush oc Lwt_io.flush oc >>= fun () ->
>>= fun () ->
try%lwt try%lwt
Lwt_io.read_line ic Lwt_io.read_line ic >>= fun json ->
>>= fun json ->
match match
CCResult.guard (fun () -> Serialization_j.answer_of_string json) CCResult.guard (fun () -> Serialization_j.answer_of_string json)
with with
| Ok answer -> ( | Ok answer -> (
match answer with match answer with
| `Ok -> | `Ok ->
Logs_lwt.debug (fun m -> Logs_lwt.debug (fun m ->
m "Result %i,%i successfully sent." (fst result.id) m "Result %i,%i successfully sent." (fst result.id)
(snd result.id)) (snd result.id))
>|= fun () -> false >|= fun () -> false
| `Error s -> | `Error s ->
Logs_lwt.err (fun m -> Logs_lwt.err (fun m ->
m "Error during the reception of the result %i,%i: %s" m "Error during the reception of the result %i,%i: %s"
(fst result.id) (snd result.id) s) (fst result.id) (snd result.id) s)
>|= fun () -> true ) >|= fun () -> true )
| Error e -> | Error e ->
Logs_lwt.err (fun m -> Logs_lwt.err (fun m ->
m "Error during the reception of the answer: %s" m "Error during the reception of the answer: %s"
...@@ -80,22 +109,18 @@ let rec send_result sockaddr pass result = ...@@ -80,22 +109,18 @@ let rec send_result sockaddr pass result =
then send_result sockaddr pass result then send_result sockaddr pass result
let handle_computation sockaddr pass computation () = let handle_computation sockaddr pass computation () =
run_computation computation run_computation computation >>= fun result ->
>>= fun result ->
let sockaddr = let sockaddr =
match sockaddr with match sockaddr with
| Unix.ADDR_INET (a, _) -> | Unix.ADDR_INET (a, _) -> Unix.ADDR_INET (a, computation.port)
Unix.ADDR_INET (a, computation.port) | s -> s
| s ->
s
in in
send_result sockaddr pass result send_result sockaddr pass result
<&> Logs_lwt.debug (fun m -> <&> Logs_lwt.debug (fun m ->
m "End computation %i,%i" (fst computation.id) (snd computation.id)) m "End computation %i,%i" (fst computation.id) (snd computation.id))
let server_handler pass sockaddr (ic, oc) = let server_handler pass sockaddr (ic, oc) =
Lwt_io.read_line ic Lwt_io.read_line ic >>= fun json ->
>>= fun json ->
( match CCResult.guard (fun () -> Serialization_j.query_of_string json) with ( match CCResult.guard (fun () -> Serialization_j.query_of_string json) with
| Result.Ok (query_pass, query) -> | Result.Ok (query_pass, query) ->
if query_pass = pass then if query_pass = pass then
...@@ -106,11 +131,16 @@ let server_handler pass sockaddr (ic, oc) =