今天在测试 SeaweedFS 的时候发现了一个恐怖的现象:我启动了6个Master和10个Volume,当其中一个Master挂掉的时候,会有N个Volume也随之看不到了。
问题重现步骤
配置信息
-
Master节点配置:
- 六个Master节点。
- 占用端口 9001-9006
-
启动命令:
-
/data/weed/bin/weed master -defaultReplication="100" -mdir="/data/weed/runtime/c1/master2" -port=9002 -peers="127.0.0.1:9001,127.0.0.1:9002,127.0.0.1:9003,127.0.0.1:9004,127.0.0.1:9005,127.0.0.1:9006"
-
-
Volume节点配置:
- 10个Volume节点
- 占用端口:9051-9055、9061-9065
-
启动命令
-
/data/weed/bin/weed volume -dataCenter="c2" -rack="r2" -dir="/data/weed/runtime/c2/volume/2" -port=9062 -max="5" -mserver="127.0.0.1:9001"
-
重现步骤
-
启动所有节点
-
查看seaweedfs配置信息,10个Volume节点全部在线
-
停掉一个非Leader的Master节点
-
查看seaweedfs配置信息,c2的r2和r4消失了。
问题原因
为了查找问题的原因,特意从 https://github.com/chrislusf/seaweedfs 下载了源代码。
1. 首先查看 weed/volume.go 文件内容(128行)
// A VolumeServer is created here. Among the arguments are the configured
// master (*v.master), the pulse interval (*v.pulseSeconds) and the data
// center / rack labels (*v.dataCenter, *v.rack).
volumeServer := weed_server.NewVolumeServer(volumeMux, publicVolumeMux,
*v.ip, *v.port, *v.publicUrl,
v.folders, v.folderMaxLimits,
volumeNeedleMapKind,
*v.master, *v.pulseSeconds, *v.dataCenter, *v.rack,
v.whiteList,
*v.fixJpgOrientation, *v.readRedirect,
)
2. 在 weed/weed_server/volume_server.go 中找到 NewVolumeServer 方法(28-102行)
// A goroutine is started here (so the loop runs asynchronously from the
// caller) that sends heartbeats to the master in an endless loop.
go func() {
// connected starts out true, so the "!connected" branch below only runs
// after at least one failed heartbeat.
connected := true
glog.V(0).Infof("Volume server bootstraps with master %s", vs.GetMasterNode())
vs.store.SetBootstrapMaster(vs.GetMasterNode())
vs.store.SetDataCenter(vs.dataCenter)
vs.store.SetRack(vs.rack)
for {
glog.V(4).Infof("Volume server sending to master %s", vs.GetMasterNode())
// The heartbeat is sent here.
master, secretKey, err := vs.store.SendHeartbeatToMaster()
if err == nil { // heartbeat succeeded; if we were disconnected, record the master and secret key and mark as connected
if !connected {
connected = true
vs.SetMasterNode(master)
vs.guard.SecretKey = secretKey
glog.V(0).Infoln("Volume Server Connected with master at", master)
}
} else { // heartbeat failed: mark as disconnected; this branch does not change or invalidate the stored master node
glog.V(1).Infof("Volume Server Failed to talk with master %s: %v", vs.masterNode, err)
if connected {
connected = false
}
}
// Wait before the next heartbeat: a random 1x-2x of pulseSeconds while
// connected, a quarter of pulseSeconds while disconnected.
if connected {
time.Sleep(time.Duration(float32(vs.pulseSeconds*1e3)*(1+rand.Float32())) * time.Millisecond)
} else {
time.Sleep(time.Duration(float32(vs.pulseSeconds*1e3)*0.25) * time.Millisecond)
}
}
}()
3. 在 storage/store.go 中找到 SendHeartbeatToMaster 方法(261-339行)
// Look up the master node to send this heartbeat to; if none can be found,
// return with the error set.
// NOTE(review): masterNode, secretKey and e are assigned with "=" and returned
// via naked returns, so they are presumably named results declared in the
// function signature, which is not part of this excerpt.
masterNode, e = s.masterNodes.findMaster()
if e != nil {
return
}
// Walk every storage location, summing the maximum volume counts, tracking
// the largest file key seen, and collecting one message per non-expired volume.
var volumeMessages []*operation.VolumeInformationMessage
maxVolumeCount := 0
var maxFileKey uint64
for _, location := range s.Locations {
maxVolumeCount = maxVolumeCount + location.MaxVolumeCount
for k, v := range location.volumes {
if maxFileKey < v.nm.MaxFileKey() {
maxFileKey = v.nm.MaxFileKey()
}
if !v.expired(s.volumeSizeLimit) {
volumeMessage := &operation.VolumeInformationMessage{
Id: proto.Uint32(uint32(k)),
Size: proto.Uint64(uint64(v.Size())),
Collection: proto.String(v.Collection),
FileCount: proto.Uint64(uint64(v.nm.FileCount())),
DeleteCount: proto.Uint64(uint64(v.nm.DeletedCount())),
DeletedByteCount: proto.Uint64(v.nm.DeletedSize()),
ReadOnly: proto.Bool(v.readOnly),
ReplicaPlacement: proto.Uint32(uint32(v.ReplicaPlacement.Byte())),
Version: proto.Uint32(uint32(v.Version())),
Ttl: proto.Uint32(v.Ttl.ToUint32()),
}
volumeMessages = append(volumeMessages, volumeMessage)
} else {
// Expired volumes are not reported; they are deleted once
// exiredLongEnough(MAX_TTL_VOLUME_REMOVAL_DELAY) holds, otherwise only logged.
if v.exiredLongEnough(MAX_TTL_VOLUME_REMOVAL_DELAY) {
s.DeleteVolume(location.volumes, v)
glog.V(0).Infoln("volume", v.Id, "is deleted.")
} else {
glog.V(0).Infoln("volume", v.Id, "is expired.")
}
}
}
}
// Build the join message describing this store. IsInit is true while the
// store has not yet been marked connected.
joinMessage := &operation.JoinMessage{
IsInit: proto.Bool(!s.connected),
Ip: proto.String(s.Ip),
Port: proto.Uint32(uint32(s.Port)),
PublicUrl: proto.String(s.PublicUrl),
MaxVolumeCount: proto.Uint32(uint32(maxVolumeCount)),
MaxFileKey: proto.Uint64(maxFileKey),
DataCenter: proto.String(s.dataCenter),
Rack: proto.String(s.rack),
Volumes: volumeMessages,
}
data, err := proto.Marshal(joinMessage)
if err != nil {
return "", "", err
}
// Join the master node by posting the marshalled message to its /dir/join URL.
joinUrl := "http://" + masterNode + "/dir/join"
glog.V(4).Infof("Connecting to %s ...", joinUrl)
jsonBlob, err := util.PostBytes(joinUrl, data)
if err != nil {
// NOTE(review): reset() is not shown here; presumably it clears the
// selected master so that findMaster picks again next time — confirm.
s.masterNodes.reset()
return "", "", err
}
var ret operation.JoinResult
if err := json.Unmarshal(jsonBlob, &ret); err != nil {
glog.V(0).Infof("Failed to join %s with response: %s", joinUrl, string(jsonBlob))
s.masterNodes.reset()
return masterNode, "", err
}
if ret.Error != "" {
s.masterNodes.reset()
return masterNode, "", errors.New(ret.Error)
}
// Success: take the volume size limit and secret key from the reply and
// mark the store as connected.
s.volumeSizeLimit = ret.VolumeSizeLimit
secretKey = security.Secret(ret.SecretKey)
s.connected = true
return
4. 在 storage/store.go 中找到 findMaster 方法(52-76行)
// findMaster returns the address of the master node currently selected for
// this store.
//
// It returns an error when mn.nodes is empty, or when no node could be
// selected. While mn.lastNode is negative, each known node is asked for the
// list of masters; on the first non-empty answer mn.nodes is replaced by that
// list plus the queried node, and mn.lastNode is set to a random index into it.
// When mn.lastNode is already non-negative, the previously selected node is
// returned without contacting anyone.
func (mn *MasterNodes) findMaster() (string, error) {
if len(mn.nodes) == 0 {
return "", errors.New("No master node found!")
}
// Only when lastNode is negative is the node list refreshed and a node chosen.
if mn.lastNode < 0 {
for _, m := range mn.nodes {
glog.V(4).Infof("Listing masters on %s", m)
// Ask node m for the list of all master nodes.
if masters, e := operation.ListMasters(m); e == nil {
if len(masters) == 0 {
continue
}
mn.nodes = append(masters, m)
// Pick a random entry, i.e. this volume server registers with a
// randomly chosen master node.
mn.lastNode = rand.Intn(len(mn.nodes))
glog.V(2).Infof("current master nodes is %v", mn)
break
} else {
glog.V(4).Infof("Failed listing masters on %s: %v", m, e)
}
}
}
if mn.lastNode < 0 {
return "", errors.New("No master node available!")
}
// Return the node at index lastNode as the master.
return mn.nodes[mn.lastNode], nil
}
总结
Volume节点会随机注册到一个Master节点。在 weed/weed_server/volume_server.go 中调用 SendHeartbeatToMaster 出现异常之后,只是将Volume的状态设置成未连接,并没有将所注册到的Master标记为失效或注销。
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/10789.html