Skip to content

Commit

Permalink
https://github.com/uavorg/uavstack/issues/470
Browse files Browse the repository at this point in the history
进程死亡区分MA挂掉
  • Loading branch information
fyb007 committed Dec 28, 2018
1 parent 74f06c7 commit 57f7445
Showing 1 changed file with 56 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import com.creditease.agent.ConfigurationManager;
import com.creditease.agent.helpers.DataConvertHelper;
import com.creditease.agent.helpers.JSONHelper;
import com.creditease.agent.helpers.StringHelper;
import com.creditease.agent.monitor.api.MonitorDataFrame;
import com.creditease.agent.monitor.api.NotificationEvent;
import com.creditease.agent.spi.AbstractTimerWork;
Expand Down Expand Up @@ -61,7 +62,7 @@ private static class CrashEventObj {

private String ip;
private String appgroup;
private int deadProcsCount = 0;
private String nodeuuid;
private List<String> deadProcsInfo = new ArrayList<String>();
private List<String> deadProcNames = new ArrayList<String>();

Expand All @@ -81,14 +82,19 @@ public String getIp() {
return ip;
}

/**
 * Returns the node UUID held by this crash event object.
 *
 * @return the current value of the {@code nodeuuid} field
 */
public String getNodeuuid() {

return nodeuuid;
}

public int getDeadProcsCount() {

return deadProcsCount;
return deadProcNames.size();
}

public void increDeadProcsCount() {
public void setNodeuuid(String nodeuuid) {

deadProcsCount++;
this.nodeuuid = nodeuuid;
}

public void addDeadProcName(String name) {
Expand Down Expand Up @@ -117,7 +123,7 @@ public String getDeadProcsInfoAsString() {

StringBuffer sb = new StringBuffer();
for (String dpi : deadProcsInfo) {
sb.append(dpi + "\n");
sb.append(dpi).append("\n");
}

return sb.toString().replace("\\", "\\\\");
Expand All @@ -130,12 +136,12 @@ public String getDeadProcsInfoAsString() {
private static final String CRASH_PROCS = "rtnotify.dead.procs";
private static final String CRASH_PROCS_DETAIL = "rtnotify.dead.procs.detail";
private static final long LOCK_TIMEOUT = 30 * 1000;
private static final long DEFAULT_CRASH_TIMEOUT = 5 * 60 * 1000;
private static final long MIN_RANDOM_PORT = 32768;

private CacheManager cm;
private CacheLock lock;
private int hold;
private int timeout;
private boolean isSendMq;
private boolean isExchange;

Expand All @@ -149,6 +155,7 @@ public NodeInfoWatcher(String cName, String feature) {

hold = DataConvertHelper.toInt(getConfigManager().getFeatureConfiguration(feature, "nodeinfotimer.period"),
15000);
timeout = DataConvertHelper.toInt(getConfigManager().getFeatureConfiguration(feature, "crash.timeout"), 300000);
isSendMq = DataConvertHelper
.toBoolean(getConfigManager().getFeatureConfiguration(feature, "nodeinfoprocess.sendmq"), true);

Expand Down Expand Up @@ -204,7 +211,7 @@ public void run() {
/**
* Step 3: check if any proc crash
*/
judgeProcCrash();
judgeProcCrash(data);

/**
* Step 4: push data to runtimenotify mgr or to mq
Expand Down Expand Up @@ -235,27 +242,27 @@ private List<Map<String, Object>> syncProcInfoToCache(Map<String, String> data)
Map<String, String> fieldValues = new HashMap<String, String>();
Map<String, String> fieldValuesDetail = new HashMap<String, String>();

for (String node : data.values()) {

Map<String, Object> mdfMap = buildMDF(node);
for (Map.Entry<String, String> entry : data.entrySet()) {

Map<String, Object> mdfMap = buildMDF(entry.getValue());
MonitorDataFrame mdf = new MonitorDataFrame(mdfMap);

String time = mdf.getTimeFlag() + "";
List<Map> els = mdf.getElemInstances("server", "procState");
for (Map el : els) {
try {
String group = mdf.getExt("appgroup");
String ip = mdf.getIP();

@SuppressWarnings("unchecked")
Map<String, Object> m = (Map<String, Object>) el.get("values");
String hashKey = genProcHashKey(mdf.getIP(), m);

String hashKey = genProcHashKey(ip, m);
Map<String, String> detail = new HashMap<String, String>();
detail.put("appgroup", mdf.getExt("appgroup"));
detail.put("nodeuuid", entry.getKey());

// 分别存时间戳和group
fieldValues.put(hashKey, time);
fieldValuesDetail.put(hashKey, group);
fieldValuesDetail.put(hashKey, JSONHelper.toString(detail));
}
catch (Exception e) {
log.err(this, "Sync ProcInfo To Cache Fail." + " ProcInfo:" + JSONHelper.toString(el), e);
Expand Down Expand Up @@ -287,29 +294,18 @@ private List<Map<String, Object>> syncProcInfoToCache(Map<String, String> data)
*
* 4.时间戳超过进程死亡时间(可配置)则保存至死亡进程list并在redis中删除该进程。
*/
private void judgeProcCrash() {
private void judgeProcCrash(Map<String, String> data) {

if (log.isDebugEnable()) {
log.debug(this, "NodeInfoWatcher Judge Crash START.");
}

Map<String, String> allProcs = null;
Map<String, String> allProcDetails = null;
try {
allProcs = cm.getHashAll(UAV_CACHE_REGION, CRASH_PROCS);
allProcDetails = cm.getHashAll(UAV_CACHE_REGION, CRASH_PROCS_DETAIL);
}
catch (Exception e) {
log.err(this, "Fail to get all process info", e);
return;
}

Map<String, String> allProcs = cm.getHashAll(UAV_CACHE_REGION, CRASH_PROCS);
Map<String, String> allProcDetails = cm.getHashAll(UAV_CACHE_REGION, CRASH_PROCS_DETAIL);
if (allProcs == null) {
return;
}

String cfgTimeout = getConfigManager().getFeatureConfiguration(feature, "crash.timeout");
long timeout = DataConvertHelper.toLong(cfgTimeout, DEFAULT_CRASH_TIMEOUT);
long deadline = System.currentTimeMillis() - timeout;

List<String> delKeys = new ArrayList<>();
Expand Down Expand Up @@ -406,18 +402,19 @@ private void judgeProcCrash() {
for (String key : deadKeys) {
Map<String, String> procDetail = new HashMap<String, String>();
procDetail.put("deadtime", allProcs.get(key));
procDetail.put("appgroup", allProcDetails.get(key));
procDetail.put("detail", allProcDetails.get(key));

deadProcs.put(key, procDetail);
}

fireEvent(deadProcs);
fireEvent(deadProcs, data);
}

/**
* 触发预警事件
*/
private void fireEvent(Map<String, Map<String, String>> deadProcs) {
@SuppressWarnings("unchecked")
private void fireEvent(Map<String, Map<String, String>> deadProcs, Map<String, String> data) {

/**
* Step 1: split crash event by IP
Expand All @@ -430,21 +427,33 @@ private void fireEvent(Map<String, Map<String, String>> deadProcs) {
String[] procInfo = procKey.split("_", -1);
String ip = procInfo[0];
String procName = procInfo[1];

String deadtime = en.getValue().get("deadtime");
String appgroup = en.getValue().get("appgroup");

Map<String, String> map = en.getValue();

String deadtime = map.get("deadtime");
String appgroup;
String nodeuuid = "";

if(map.get("detail") != null) {
Map<String, String> detail = JSONHelper.toObject(map.get("detail"), Map.class);
appgroup = detail.get("appgroup");
nodeuuid = detail.get("nodeuuid");
}
else{
appgroup = map.get("appgroup");
}

CrashEventObj ceo;

if (!ips.containsKey(ip)) {
ceo = new CrashEventObj(ip, appgroup);
ceo.setNodeuuid(nodeuuid);
ips.put(ip, ceo);
}
else {
ceo = ips.get(ip);
}

ceo.increDeadProcsCount();
ceo.addDeadProcName(procName);
ceo.addDeadProcInfo("触发时间:" + format.format(new Date(Long.parseLong(deadtime))) + ", 进程信息:" + procKey);
}
Expand All @@ -455,9 +464,18 @@ private void fireEvent(Map<String, Map<String, String>> deadProcs) {
RuntimeNotifyStrategyMgr strategyMgr = (RuntimeNotifyStrategyMgr) getConfigManager().getComponent(this.feature,
"RuntimeNotifyStrategyMgr");
for (CrashEventObj ceo : ips.values()) {

String title;
String nodeuuid = ceo.getNodeuuid();
if (!StringHelper.isEmpty(nodeuuid) && StringHelper.isEmpty(data.get(nodeuuid))) {
title = "应用组[" + ceo.getAppGroup() + "]的" + ceo.getIp() + "监控代理程序(MonitorAgent)超过" + timeout / 1000
+ "秒没有心跳数据上送";
}
else {
title = "应用组[" + ceo.getAppGroup() + "]的" + ceo.getIp() + "共发现" + ceo.getDeadProcsCount() + "进程"
+ ceo.getDeadProcNamesAsString() + "可疑死掉";
}

String title = "应用组[" + ceo.getAppGroup() + "]的" + ceo.getIp() + "共发现" + ceo.getDeadProcsCount() + "进程"
+ ceo.getDeadProcNamesAsString() + "可疑死掉";
String description = ceo.getDeadProcsInfoAsString();

NotificationEvent event = new NotificationEvent(NotificationEvent.EVENT_RT_ALERT_CRASH, title, description, System.currentTimeMillis(), ceo.getIp(), "");
Expand All @@ -466,8 +484,8 @@ private void fireEvent(Map<String, Map<String, String>> deadProcs) {
* Notification Manager will not block the event, the frozen time has no effect to this event
*/
event.addArg(NotificationEvent.EVENT_Tag_NoBlock, "true");
// add appgroup
event.addArg("appgroup", ceo.getAppGroup());
event.addArg("nodeuuid", nodeuuid);

NotifyStrategy stra = strategyMgr.seekStrategy("server@procCrash@" + ceo.getIp());

Expand Down

0 comments on commit 57f7445

Please sign in to comment.