logthread.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2014 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "jmisc.hpp"
  14. #include "jexcept.hpp"
  15. #include "jdebug.hpp"
  16. #include "LoggingErrors.hpp"
  17. #include "LogSerializer.hpp"
  18. #include "logthread.hpp"
  19. const char* const PropMaxLogQueueLength = "MaxLogQueueLength";
  20. const char* const PropQueueSizeSignal = "QueueSizeSignal";
  21. const char* const PropMaxTriesRS = "MaxTriesRS";
  22. const char* const PropFailSafe = "FailSafe";
  23. const char* const PropFailSafeLogsDir = "FailSafeLogsDir";
  24. #define MaxLogQueueLength 500000 //Write a warning into log when queue length is greater than 500000
  25. #define QueueSizeSignal 10000 //Write a warning into log when queue length is increased by 10000
  26. const int DefaultMaxTriesRS = -1; // Max. # of attempts to send log message to WsReportService. Default: infinite
  27. extern LOGGINGCOMMON_API IUpdateLogThread* createUpdateLogThread(IPropertyTree* _cfg, const char* _service, const char* _agentName, IEspLogAgent* _logAgent)
  28. {
  29. if (!_cfg)
  30. return NULL;
  31. IUpdateLogThread* loggingThread = new CLogThread(_cfg, _service, _agentName, _logAgent);
  32. loggingThread->start();
  33. return loggingThread;
  34. }
  35. CLogThread::CLogThread(IPropertyTree* _cfg , const char* _service, const char* _agentName, IEspLogAgent* _logAgent)
  36. : stopping(false), agentName(_agentName)
  37. {
  38. if(!_agentName || !*_agentName)
  39. throw MakeStringException(-1,"No Logging agent name defined");
  40. if(!_cfg)
  41. throw MakeStringException(-1,"No Logging agent Configuration for %s", _agentName);
  42. if(!_service || !*_service)
  43. throw MakeStringException(-1,"No service name defined for %s", _agentName);
  44. if(!_logAgent)
  45. throw MakeStringException(-1,"No Logging agent interface for %s", _agentName);
  46. const char* servicesConfig = _cfg->queryProp("@services");
  47. if (!servicesConfig || !*servicesConfig)
  48. throw MakeStringException(-1,"No Logging Service defined for %s", _agentName);
  49. StringArray serviceArray;
  50. serviceArray.appendListUniq(servicesConfig, ",");
  51. unsigned i=0;
  52. ForEachItemIn(s, serviceArray)
  53. {
  54. const char* service = serviceArray.item(s);
  55. if (service && strieq(service, "UpdateLOG"))
  56. services[i++] = LGSTUpdateLOG;
  57. else if (service && strieq(service, "GetTransactionSeed"))
  58. services[i++] = LGSTGetTransactionSeed;
  59. else if (service && strieq(service, "GetTransactionID"))
  60. services[i++] = LGSTGetTransactionID;
  61. }
  62. services[i] = LGSTterm;
  63. logAgent.setown(_logAgent);
  64. maxLogQueueLength = _cfg->getPropInt(PropMaxLogQueueLength, MaxLogQueueLength);
  65. signalGrowingQueueAt = _cfg->getPropInt(PropQueueSizeSignal, QueueSizeSignal);
  66. maxLogRetries = _cfg->getPropInt(PropMaxTriesRS, DefaultMaxTriesRS);
  67. ensureFailSafe = _cfg->getPropBool(PropFailSafe);
  68. if(ensureFailSafe)
  69. {
  70. const char * logsDir = _cfg->queryProp(PropFailSafeLogsDir);
  71. if (!logsDir || !*logsDir)
  72. logsDir = "./FailSafeLogs";
  73. logFailSafe.setown(createFailSafeLogger(_service, _agentName, logsDir));
  74. }
  75. }
  76. CLogThread::~CLogThread()
  77. {
  78. ESPLOG(LogMax, "CLogThread::~CLogThread()");
  79. }
  80. void CLogThread::start()
  81. {
  82. Thread::start();
  83. }
  84. int CLogThread::run()
  85. {
  86. Link();
  87. if(logFailSafe.get())
  88. checkPendingLogs(false);
  89. while(!stopping)
  90. {
  91. m_sem.wait(UPDATELOGTHREADWAITINGTIME);
  92. sendLog();
  93. if(logFailSafe.get())
  94. {
  95. checkPendingLogs(true);
  96. checkRollOver();
  97. }
  98. }
  99. Release();
  100. return 0;
  101. }
  102. void CLogThread::stop()
  103. {
  104. try
  105. {
  106. CriticalBlock b(logQueueCrit);
  107. if (!logQueue.ordinality() && logFailSafe.get())
  108. logFailSafe->RollCurrentLog();
  109. //If logQueue is not empty, the log files are rolled over so that queued jobs can be read
  110. //when the CLogThread is restarted.
  111. }
  112. catch(...)
  113. {
  114. DBGLOG("Exception");
  115. }
  116. stopping = true;
  117. m_sem.signal();
  118. join();
  119. }
  120. bool CLogThread::queueLog(IEspUpdateLogRequest* logRequest)
  121. {
  122. if (!logRequest)
  123. return false;
  124. Owned<IEspUpdateLogRequestWrap> logRequestWrap = new CUpdateLogRequestWrap(NULL, logRequest->getOption(), logRequest->getLogContent());
  125. return enqueue(logRequestWrap);
  126. }
  127. bool CLogThread::queueLog(IEspUpdateLogRequestWrap* logRequest)
  128. {
  129. unsigned startTime = (getEspLogLevel()>=LogNormal) ? msTick() : 0;
  130. logAgent->filterLogContent(logRequest);
  131. ESPLOG(LogNormal, "LThread:filterLog: %dms\n", msTick() - startTime);
  132. return enqueue(logRequest);
  133. }
  134. bool CLogThread::enqueue(IEspUpdateLogRequestWrap* logRequest)
  135. {
  136. if (logFailSafe.get())
  137. {
  138. StringBuffer GUID, reqBuf;
  139. unsigned startTime = (getEspLogLevel()>=LogNormal) ? msTick() : 0;
  140. logFailSafe->GenerateGUID(GUID, NULL);
  141. logRequest->setGUID(GUID.str());
  142. if (serializeLogRequestContent(logRequest, reqBuf))
  143. logFailSafe->Add(GUID, reqBuf.str());
  144. ESPLOG(LogNormal, "LThread:addToFailSafe: %dms\n", msTick() - startTime);
  145. }
  146. writeJobQueue(logRequest);
  147. m_sem.signal();
  148. return true;
  149. }
  150. void CLogThread::sendLog()
  151. {
  152. try
  153. {
  154. if(stopping)
  155. return;
  156. int recSend = 0;
  157. while(true)
  158. {
  159. IEspUpdateLogRequestWrap* logRequest = readJobQueue();
  160. if (!logRequest)
  161. break;
  162. const char* GUID= logRequest->getGUID();
  163. if ((!GUID || !*GUID) && ensureFailSafe && logFailSafe.get())
  164. continue;
  165. try
  166. {
  167. unsigned startTime = (getEspLogLevel()>=LogNormal) ? msTick() : 0;
  168. Owned<IEspUpdateLogResponse> logResponse = createUpdateLogResponse();
  169. logAgent->updateLog(*logRequest, *logResponse);
  170. if (!logResponse)
  171. throw MakeStringException(EspLoggingErrors::UpdateLogFailed, "no response");
  172. if (logResponse->getStatusCode())
  173. {
  174. const char* statusMessage = logResponse->getStatusMessage();
  175. if(statusMessage && *statusMessage)
  176. throw MakeStringException(EspLoggingErrors::UpdateLogFailed, "%s", statusMessage);
  177. else
  178. throw MakeStringException(EspLoggingErrors::UpdateLogFailed, "Unknown error");
  179. }
  180. ESPLOG(LogNormal, "LThread:updateLog: %dms\n", msTick() - startTime);
  181. if(ensureFailSafe && logFailSafe.get())
  182. {
  183. unsigned startTime1 = (getEspLogLevel()>=LogNormal) ? msTick() : 0;
  184. logFailSafe->AddACK(GUID);
  185. ESPLOG(LogNormal, "LThread:AddACK: %dms\n", msTick() - startTime1);
  186. }
  187. logRequest->Release();//Make sure that no data (such as GUID) is needed before releasing the logRequest.
  188. }
  189. catch(IException* e)
  190. {
  191. StringBuffer errorStr, errorMessage;
  192. errorMessage.appendf("Failed to update log for %s: error code %d, error message %s", GUID, e->errorCode(), e->errorMessage(errorStr).str());
  193. e->Release();
  194. bool willRetry = false;
  195. if (maxLogRetries != 0)
  196. {
  197. unsigned retry = logRequest->incrementRetryCount();
  198. if (retry > maxLogRetries)
  199. errorMessage.append(" Max logging retries exceeded.");
  200. else
  201. {
  202. willRetry = true;
  203. writeJobQueue(logRequest);
  204. errorMessage.appendf(" Adding back to logging queue for retrying %d.", retry);
  205. }
  206. }
  207. if (!willRetry)
  208. {
  209. if(ensureFailSafe && logFailSafe.get())
  210. logFailSafe->AddACK(GUID);
  211. logRequest->Release();
  212. }
  213. ERRLOG("%s", errorMessage.str());
  214. }
  215. }
  216. }
  217. catch(IException* e)
  218. {
  219. StringBuffer errorStr, errorMessage;
  220. errorMessage.append("Exception thrown within update log thread: error code ").append(e->errorCode()).append(", error message ").append(e->errorMessage(errorStr));
  221. ERRLOG("%s", errorMessage.str());
  222. e->Release();
  223. }
  224. catch(...)
  225. {
  226. ERRLOG("Unknown exception thrown within update log thread");
  227. }
  228. return;
  229. }
  230. //////////////////////////FailSafe////////////////////////////
  231. void CLogThread::checkRollOver()
  232. {
  233. try
  234. {
  235. bool bRollover = false;
  236. time_t tNow;
  237. time(&tNow);
  238. struct tm ltNow;
  239. localtime_r(&tNow, &ltNow);
  240. if ((ltNow.tm_year != m_startTime.tm_year || ltNow.tm_yday != m_startTime.tm_yday))
  241. {
  242. bRollover = true;
  243. localtime_r(&tNow, &m_startTime); // reset the start time for next rollover check
  244. }
  245. if (!bRollover)
  246. return;
  247. //Rename .log files to .old files
  248. logFailSafe->SafeRollover();
  249. CriticalBlock b(logQueueCrit);
  250. //Check and add queued requests to tank(.log) files
  251. unsigned numNewArrivals = logQueue.ordinality();
  252. if(numNewArrivals <= 0)
  253. return;
  254. ESPLOG(LogMax, "writing %d requests in the queue to the rolled over tank file.", numNewArrivals);
  255. unsigned startTime = (getEspLogLevel()>=LogNormal) ? msTick() : 0;
  256. for(unsigned i = 0; i < numNewArrivals; i++)
  257. {
  258. IInterface* pRequest = logQueue.item(i);
  259. if (!pRequest)
  260. continue;
  261. IEspUpdateLogRequestWrap* pEspRequest = dynamic_cast<IEspUpdateLogRequestWrap*>(pRequest);
  262. if(!pEspRequest)
  263. continue;
  264. StringBuffer reqBuf;
  265. const char* GUID = pEspRequest->getGUID();
  266. if(GUID && *GUID && serializeLogRequestContent(pEspRequest, reqBuf))
  267. logFailSafe->Add(GUID, reqBuf.str());
  268. }
  269. ESPLOG(LogNormal, "LThread:AddFailSafe: %dms\n", msTick() - startTime);
  270. }
  271. catch(IException* Ex)
  272. {
  273. StringBuffer str;
  274. Ex->errorMessage(str);
  275. ERRLOG("Exception thrown during tank file rollover: %s",str.str());
  276. Ex->Release();
  277. }
  278. catch(...)
  279. {
  280. ERRLOG("Unknown exception thrown during tank file rollover.");
  281. }
  282. }
  283. unsigned CLogThread::serializeLogRequestContent(IEspUpdateLogRequestWrap* pRequest, StringBuffer& logData)
  284. {
  285. const char* GUID = pRequest->getGUID();
  286. const char* option = pRequest->getOption();
  287. const char* logRequest = pRequest->getUpdateLogRequest();
  288. if (GUID && *GUID)
  289. logData.append("<GUID>").append(GUID).append("</GUID>");
  290. if (option && *option)
  291. logData.append("<Option>").append(option).append("</Option>");
  292. if (logRequest && *logRequest)
  293. {
  294. StringBuffer buffer;
  295. JBASE64_Encode(logRequest, strlen(logRequest), buffer);
  296. logData.append("<LogRequest>").append(buffer.str()).append("</LogRequest>");
  297. }
  298. return logData.length();
  299. }
  300. void CLogThread::checkPendingLogs(bool bOneRecOnly)
  301. {
  302. try
  303. {
  304. bool queueLogError = false;
  305. bool bFirst = true;
  306. StringBuffer GUID, logData;
  307. while (logFailSafe->PopPendingLogRecord(GUID, logData))
  308. {
  309. if (bFirst && !bOneRecOnly)
  310. {
  311. DBGLOG("We have old logs!. Will now try and recover the lost log messages");
  312. bFirst = false;
  313. }
  314. Owned<IEspUpdateLogRequestWrap> logRequest = unserializeLogRequestContent(logData.str());
  315. if (!logRequest)
  316. ERRLOG("checkPendingLogs: failed to unserialize: %s", logData.str());
  317. else if (!enqueue(logRequest))
  318. {
  319. ERRLOG("checkPendingLogs: failed to add a log request to queue");
  320. queueLogError=true;
  321. }
  322. if (bOneRecOnly)
  323. break;
  324. }
  325. //if everything went ok then we should be able to rollover the old logs.
  326. if (!queueLogError && !bOneRecOnly)
  327. logFailSafe->RolloverAllLogs();
  328. }
  329. catch(IException* ex)
  330. {
  331. StringBuffer errorStr;
  332. ex->errorMessage(errorStr);
  333. ERRLOG("CheckPendingLogs: %s:" ,errorStr.str());
  334. ex->Release();
  335. }
  336. catch(...)
  337. {
  338. ERRLOG("Unknown exception thrown in CheckPendingLogs");
  339. }
  340. }
  341. IEspUpdateLogRequestWrap* CLogThread::unserializeLogRequestContent(const char* logData)
  342. {
  343. if (!logData && *logData)
  344. return NULL;
  345. Owned<IPropertyTree> pLogTree = createPTreeFromXMLString(logData);
  346. if (!pLogTree)
  347. return NULL;
  348. const char* guid = pLogTree->queryProp("GUID");
  349. const char* opt = pLogTree->queryProp("Option");
  350. const char* logRequest = pLogTree->queryProp("LogRequest");
  351. if (!logRequest || !*logRequest)
  352. return NULL;
  353. StringBuffer buffer;
  354. JBASE64_Decode(logRequest, buffer);
  355. return new CUpdateLogRequestWrap(guid, opt, buffer.str());
  356. };
  357. void CLogThread::writeJobQueue(IEspUpdateLogRequestWrap* jobToWrite)
  358. {
  359. if (jobToWrite)
  360. {
  361. unsigned startTime = (getEspLogLevel()>=LogNormal) ? msTick() : 0;
  362. CriticalBlock b(logQueueCrit);
  363. ESPLOG(LogNormal, "LThread:waitWQ: %dms\n", msTick() - startTime);
  364. int QueueSize = logQueue.ordinality();
  365. if(QueueSize > maxLogQueueLength)
  366. ERRLOG("LOGGING QUEUE SIZE %d EXECEEDED MaxLogQueueLength %d, check the logging server.",QueueSize, maxLogQueueLength);
  367. if(QueueSize!=0 && QueueSize % signalGrowingQueueAt == 0)
  368. ERRLOG("Logging Queue at %d records. Check the logging server.",QueueSize);
  369. logQueue.enqueue(LINK(jobToWrite));
  370. }
  371. }
  372. IEspUpdateLogRequestWrap* CLogThread::readJobQueue()
  373. {
  374. unsigned startTime = (getEspLogLevel()>=LogNormal) ? msTick() : 0;
  375. CriticalBlock b(logQueueCrit);
  376. ESPLOG(LogNormal, "LThread:waitRQ: %dms\n", msTick() - startTime);
  377. return (IEspUpdateLogRequestWrap*)logQueue.dequeue();
  378. }