preemptive.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. package cluster
  2. import (
  3. "errors"
  4. "net"
  5. "time"
  6. "github.com/langgenius/dify-plugin-daemon/internal/utils/cache"
  7. "github.com/langgenius/dify-plugin-daemon/internal/utils/network"
  8. "github.com/langgenius/dify-plugin-daemon/internal/utils/parser"
  9. )
// The plugin daemon preemptively tries to lock the slot to become the master of the cluster
// and keeps the current status of the whole cluster up to date.
// Once the master is no longer active, one of the slaves will try to lock the slot again
// and become the new master.
//
// Once a node becomes master, it takes responsibility for garbage-collecting the nodes that
// have already been deactivated, and every node is responsible for maintaining its own status.
  17. //
  18. // State:
  19. // - hashmap[cluster-status]
  20. // - node-id:
  21. // - list[ip]:
  22. // - address: string
  23. // - vote: int
  24. // - last_ping_at: int64
  25. // - preemption-lock: node-id
  26. // - node-status-upgrade-status
  27. //
  28. // A node will be removed from the cluster if it is no longer active
const (
	// CLUSTER_STATUS_HASH_MAP_KEY is the cache key of the hash map that stores
	// one status entry per node; its fields are addressed by node id.
	CLUSTER_STATUS_HASH_MAP_KEY = "cluster-status-hash-map"
	// PREEMPTION_LOCK_KEY is the cache key a node sets (only if absent) to its
	// own id in order to claim the master slot.
	PREEMPTION_LOCK_KEY = "cluster-master-preemption-lock"
)
  33. // try lock the slot to be the master of the cluster
  34. // returns:
  35. // - bool: true if the slot is locked by the node
  36. // - error: error if any
  37. func (c *Cluster) lockMaster() (bool, error) {
  38. var final_error error
  39. for i := 0; i < 3; i++ {
  40. if success, err := cache.SetNX(PREEMPTION_LOCK_KEY, c.id, MASTER_LOCK_EXPIRED_TIME); err != nil {
  41. // try again
  42. if final_error == nil {
  43. final_error = err
  44. } else {
  45. final_error = errors.Join(final_error, err)
  46. }
  47. } else if !success {
  48. return false, nil
  49. } else {
  50. return true, nil
  51. }
  52. }
  53. return false, final_error
  54. }
  55. // update master
  56. func (c *Cluster) updateMaster() error {
  57. // update expired time of master key
  58. if _, err := cache.Expire(PREEMPTION_LOCK_KEY, MASTER_LOCK_EXPIRED_TIME); err != nil {
  59. return err
  60. }
  61. return nil
  62. }
  63. // update the status of the node
  64. func (c *Cluster) updateNodeStatus() error {
  65. if err := c.LockNodeStatus(c.id); err != nil {
  66. return err
  67. }
  68. defer c.UnlockNodeStatus(c.id)
  69. // update the status of the node
  70. node_status, err := cache.GetMapField[node](CLUSTER_STATUS_HASH_MAP_KEY, c.id)
  71. if err != nil {
  72. if err == cache.ErrNotFound {
  73. // try to get ips configs
  74. ips, err := network.FetchCurrentIps()
  75. if err != nil {
  76. return err
  77. }
  78. node_status = &node{
  79. Ips: parser.Map(func(from net.IP) ip {
  80. return ip{
  81. Address: from.String(),
  82. Votes: []vote{},
  83. }
  84. }, ips),
  85. }
  86. } else {
  87. return err
  88. }
  89. } else {
  90. ips, err := network.FetchCurrentIps()
  91. if err != nil {
  92. return err
  93. }
  94. // add new ip if not exist
  95. for _, _ip := range ips {
  96. found := false
  97. for _, node_ip := range node_status.Ips {
  98. if node_ip.Address == _ip.String() {
  99. found = true
  100. break
  101. }
  102. }
  103. if !found {
  104. node_status.Ips = append(node_status.Ips, ip{
  105. Address: _ip.String(),
  106. Votes: []vote{},
  107. })
  108. }
  109. }
  110. }
  111. // refresh the last ping time
  112. node_status.LastPingAt = time.Now().Unix()
  113. // update the status of the node
  114. if err := cache.SetMapOneField(CLUSTER_STATUS_HASH_MAP_KEY, c.id, node_status); err != nil {
  115. return err
  116. }
  117. return nil
  118. }
// IsMaster returns the value of the node's i_am_master flag, i.e. whether this
// node currently regards itself as the master of the cluster. The flag is only
// read here, never modified.
func (c *Cluster) IsMaster() bool {
	return c.i_am_master
}
  122. func (c *Cluster) IsNodeAlive(node_id string) bool {
  123. node_status, err := cache.GetMapField[node](CLUSTER_STATUS_HASH_MAP_KEY, node_id)
  124. if err != nil {
  125. return false
  126. }
  127. return time.Since(time.Unix(node_status.LastPingAt, 0)) < NODE_DISCONNECTED_TIMEOUT
  128. }