swapnode.cpp 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include "platform.h"
  15. #include "thirdparty.h"
  16. #include "jlib.hpp"
  17. #include "jfile.hpp"
  18. #include "jptree.hpp"
  19. #include "jprop.hpp"
  20. #include "jmisc.hpp"
  21. #include "mpbase.hpp"
  22. #include "daclient.hpp"
  23. #include "dadfs.hpp"
  24. #include "dafdesc.hpp"
  25. #include "dasds.hpp"
  26. #include "danqs.hpp"
  27. #include "dalienv.hpp"
  28. #include "rmtfile.hpp"
  29. #include "rmtsmtp.hpp"
  30. #ifndef _ESP
  31. #include "dautils.hpp"
  32. #include "workunit.hpp"
  33. #else
  34. #include "swapnodemain.hpp"
  35. #endif
  // Timeout handed to every querySDS().connect() call in this file
  // (presumably milliseconds - TODO confirm against the SDS API).
  36. #define SDS_LOCK_TIMEOUT 30000
  // Window used by autoRestart() when matching failed workunits to recent swaps.
  // Expressed in milliseconds; callers divide by (1000*60) before passing it to CDateTime::adjustTime.
  37. #define SWAPNODE_RETRY_TIME (1000*60*60*1) // 1hr
  // FILES_WRITE_PREFIX is prepended to the "slaves"/"spares" file names written by
  // CfixDaliDFS::writeSlavesFile. It is "test_" only in _DEBUG builds that also define
  // NIGEL_TESTING; in every other build it is empty.
  38. #ifdef _DEBUG
  39. #ifdef NIGEL_TESTING
  40. #define FILES_WRITE_PREFIX "test_"
  41. #else
  42. #define FILES_WRITE_PREFIX ""
  43. #endif
  44. #else
  45. #define FILES_WRITE_PREFIX ""
  46. #endif
  // Forward declarations for the non-ESP build: both functions are called from doSwapNode()
  // before their definitions appear later in the file.
  47. #ifndef _ESP
  48. static void doAutoSwapNode(IRemoteConnection *connEnv,IRemoteConnection *connFiles,IPropertyTree *options,bool doswap);
  // Job info object for logging; not referenced in the portion of the file reviewed here.
  49. static const LogMsgJobInfo swapnodeJob(UnknownJob, UnknownUser);
  50. static void autoRestart(IPropertyTree *options);
  51. #endif
  52. static IRemoteConnection* GetRemoteLock(const char* path, unsigned int mode, bool nonfatalinuse)
  53. {
  54. // this code not nice - could do with rewrite when time permits!
  55. IRemoteConnection* pRemoteConnection = NULL;
  56. try
  57. {
  58. PROGLOG("Getting a lock on %s ...", path);
  59. pRemoteConnection = querySDS().connect(path, myProcessSession(), mode, SDS_LOCK_TIMEOUT);
  60. }
  61. catch (IException* e)
  62. {
  63. StringBuffer sErrMsg;
  64. e->errorMessage(sErrMsg);
  65. e->Release();
  66. /*typical error message when lock fails is as follows:
  67. SDS: Lock timeout
  68. SDS Reply Error : SDS: Lock timeout
  69. Failed to establish lock to NewEnvironment/
  70. Existing lock status: Locks on path: /NewEnvironment/
  71. Endpoint |SessionId |ConnectionId |mode
  72. 172.16.48.175:7254 |c00000038 |c0000003b |653
  73. */
  74. const char* pattern = "Failed to establish lock to ";
  75. const char* match = strstr(sErrMsg.str(), pattern);
  76. if (match)
  77. {
  78. match += strlen(pattern);
  79. const char* eol = strchr(match, '\n');
  80. StringBuffer path;
  81. path.append(eol - match - 1, match);
  82. //if we can extract IP address of computer holding the lock then
  83. //show a customized message.
  84. //
  85. //Retrieve IP address of computer holding the lock...
  86. char achHost[128] = "";
  87. const char* p = strstr(sErrMsg.str(), "\n\n");
  88. if (p && *(p+=2))
  89. {
  90. const char* q = strchr(p, ':');
  91. if (q)
  92. {
  93. const int len = q-p;
  94. strncpy(achHost, p, len);
  95. achHost[len] = '\0';
  96. }
  97. }
  98. StringBuffer sMsg;
  99. sMsg.appendf("Failed to get a lock on /%s", path.str());
  100. if (achHost[0])
  101. sMsg.appendf(" because it is locked by computer %s.", achHost);
  102. else
  103. sMsg.append(":\n\n").append(sErrMsg);
  104. if (nonfatalinuse) {
  105. WARNLOG("%s",sMsg.str());
  106. return NULL;
  107. }
  108. throw ::MakeStringException(-1, "%s", sMsg.str());
  109. }
  110. else
  111. throw ::MakeStringException(-1, "%s", sErrMsg.str());
  112. }
  113. return pRemoteConnection;
  114. }
  115. static bool ensureThorIsDown(const char* cluster, bool nofail, bool wait)
  116. {
  117. bool retry = false;
  118. do {
  119. Owned<IRemoteConnection> pStatus = querySDS().connect("/Status/Servers", myProcessSession(), RTM_NONE, SDS_LOCK_TIMEOUT);
  120. Owned<IPropertyTreeIterator> it = pStatus->queryRoot()->getElements("Server[@name='ThorMaster']");
  121. retry = false;
  122. ForEach(*it) {
  123. IPropertyTree* pServer = &it->query();
  124. if (pServer->hasProp("@cluster") && !strcmp(pServer->queryProp("@cluster"), cluster)) {
  125. if (nofail) {
  126. WARNLOG("A Thor on cluster %s is still active", cluster);
  127. if (!wait)
  128. return false;
  129. Sleep(1000*10);
  130. PROGLOG("Retrying...");
  131. retry = true;
  132. break;
  133. }
  134. throw MakeStringException(-1, "A Thor cluster node swap requires the cluster to be offline. Please stop the Thor cluster '%s' and try again.", cluster);
  135. }
  136. }
  137. } while (retry);
  138. return true;
  139. }
  140. static bool doEnvironment(IPropertyTree* root, const char* clustername, const char* oldip, const char* newip, unsigned nodenum, bool& bNewMachineIsLinux)
  141. {
  142. IPropertyTree* pHardware = root->queryPropTree("Hardware");
  143. IPropertyTree* oldmachine = pHardware->queryPropTree(StringBuffer("Computer[@netAddress=\"").append(oldip).append("\"]").str());
  144. if(!oldmachine) {
  145. ERRLOG("Could not find computer with ip=%s", oldip);
  146. return false;
  147. }
  148. IPropertyTree* newmachine = pHardware->queryPropTree(StringBuffer("Computer[@netAddress=\"").append(newip).append("\"]").str());
  149. if(!newmachine) {
  150. ERRLOG("Could not find computer with ip=%s", newip);
  151. return false;
  152. }
  153. StringBuffer xpath;
  154. //determine if the new machine is linux so we can properly update its slaves file
  155. const char* newMachineType = newmachine->queryProp("@computerType");
  156. xpath.clear().appendf("ComputerType[@name='%s']", newMachineType);
  157. IPropertyTree* newMachineTypeNode = pHardware->queryPropTree( xpath.str() );
  158. if (!newMachineTypeNode) {
  159. ERRLOG("computer type '%s' of the new slave is not defined!", newMachineType);
  160. return false;
  161. }
  162. const char* os = newMachineTypeNode->queryProp("@opSys");
  163. bNewMachineIsLinux = os && !strcmp(os, "linux");
  164. DBGLOG("NewMachine=%s (%s)",newmachine->queryProp("@name"),bNewMachineIsLinux?"linux":"windows");
  165. DBGLOG("OldMachine=%s",oldmachine->queryProp("@name"));
  166. IPropertyTree* pSoftware = root->queryPropTree("Software");
  167. if(!pSoftware) {
  168. ERRLOG("Could not find /Software!");
  169. return false;
  170. }
  171. // look for all ThorCluster entries with correct nodegroup (needed for multithor)
  172. Owned<IPropertyTreeIterator> clusters = pSoftware->getElements("ThorCluster");
  173. ForEach(*clusters) {
  174. IPropertyTree &cluster = clusters->query();
  175. const char *groupname = cluster.queryProp("@nodeGroup");
  176. if (!groupname||!*groupname)
  177. groupname = cluster.queryProp("@name");
  178. if (strcmp(groupname,clustername)!=0)
  179. continue;
  180. xpath.clear().appendf("ThorSlaveProcess[@computer='%s']", oldmachine->queryProp("@name"));
  181. IPropertyTree* slave = cluster.queryPropTree(xpath.str());
  182. if(!slave) {
  183. ERRLOG("Could not find slave %s in thor %s", oldmachine->queryProp("@name"), cluster.queryProp("@name"));
  184. return false;
  185. }
  186. xpath.clear().appendf("ThorSlaveProcess[@computer='%s']", newmachine->queryProp("@name"));
  187. if (cluster.queryPropTree(xpath.str())) {
  188. ERRLOG("This would duplicate slave %s in thor %s", oldmachine->queryProp("@name"), cluster.queryProp("@name"));
  189. return false;
  190. }
  191. StringBuffer sn;
  192. if (nodenum!=0) {
  193. sn.append('s').append(nodenum);
  194. if (strcmp(slave->queryProp("@name"),sn.str())!=0) {
  195. ERRLOG("Incorrect slave number %d for slave %s(%s) in thor %s",
  196. nodenum,oldmachine->queryProp("@name"), slave->queryProp("@name"), cluster.queryProp("@name"));
  197. return false;
  198. }
  199. }
  200. xpath.clear().appendf("ThorSpareProcess[@computer='%s']", newmachine->queryProp("@name"));
  201. IPropertyTree* spare = cluster.queryPropTree(xpath.str());
  202. if(spare) {
  203. const char *state = slave->queryProp("@state"); // not sure if anyone actuall sets this but bwd compat.
  204. DBGLOG("Removing Spare:%s%s%s",slave->queryProp("@name"),state?" with status: ":"",state?state:"");
  205. cluster.removeTree(spare);
  206. }
  207. slave->setProp("@computer",newmachine->queryProp("@name"));
  208. newmachine->setProp("@state","Unavailable");
  209. }
  210. return true;
  211. }
  212. static bool resolveComputerName(IPropertyTree *rootEnv,const char *name,IpAddress &ip)
  213. {
  214. StringBuffer query;
  215. query.appendf("Hardware/Computer[@name=\"%s\"]",name);
  216. Owned<IPropertyTree> machine = rootEnv->getPropTree(query.str());
  217. const char *node = machine?machine->queryProp("@netAddress"):NULL;
  218. if (!node||!*node)
  219. false;
  220. ip.ipset(node);
  221. return true;
  222. }
  223. class CfixDaliDFS
  224. {
  225. void writeSlavesFile(IPropertyTree* rootEnv, const char *newep, IPropertyTree& cluster, bool bNewMachineIsLinux)
  226. {
  227. Owned<INode> newnode = createINode(newep);
  228. const char *groupname = cluster.queryProp("@nodeGroup");
  229. if (!groupname||!*groupname)
  230. groupname = cluster.queryProp("@name");
  231. Owned<IGroup> grp = queryNamedGroupStore().lookup(groupname);
  232. if (!grp) {
  233. ERRLOG("writeSlavesFile: group not found for cluster %s",groupname);
  234. return;
  235. }
  236. if (!grp->isMember(newnode))
  237. return;
  238. PROGLOG("Writing slaves file for cluster %s",groupname);
  239. const char *computer = cluster.queryProp("@computer");
  240. if (!computer||!*computer) {
  241. ERRLOG("writeSlavesFile: cluster has no computer specified");
  242. return;
  243. }
  244. const char *dir = cluster.queryProp("@directory");
  245. if (!dir||!*dir) {
  246. ERRLOG("writeSlavesFile: cluster has no directory specified");
  247. return;
  248. }
  249. IpAddress masterip;
  250. if (!resolveComputerName(rootEnv,computer,masterip)) {
  251. ERRLOG("writeSlavesFile: cannot resolve thor master at %s",computer);
  252. return;
  253. }
  254. char sep = bNewMachineIsLinux?'/':'\\';
  255. StringBuffer filename;
  256. filename.append(sep).append(sep);
  257. masterip.getIpText(filename);
  258. if (dir&&*dir&&!isPathSepChar(*dir))
  259. filename.append(sep);
  260. while (dir&&*dir) {
  261. if (isPathSepChar(*dir))
  262. filename.append(sep);
  263. else
  264. filename.append(*dir);
  265. dir++;
  266. }
  267. addPathSepChar(filename,sep);
  268. size32_t dirsz = filename.length();
  269. filename.append(FILES_WRITE_PREFIX "slaves");
  270. StringBuffer str;
  271. ForEachNodeInGroup(r,*grp) {
  272. grp->queryNode(r).endpoint().getUrlStr(str);
  273. if (!bNewMachineIsLinux)
  274. str.append('\r'); // not sure a good idea but consistent with deploy engine
  275. str.append('\n');
  276. }
  277. PROGLOG("Writing slaves to %s",filename.str());
  278. Owned<IFile> outfile = createIFile(filename.str());
  279. Owned<IFileIO> outfileio = outfile->open(IFOcreate);
  280. if (!outfileio)
  281. throw MakeStringException (-1,"Cannot create slaves file %s",filename.str());
  282. outfileio->write(0,str.length(),str.str());
  283. outfileio.clear();
  284. outfile.clear();
  285. str.clear();
  286. Owned<IPropertyTreeIterator> spares = cluster.getElements("ThorSpareProcess");
  287. ForEach(*spares) {
  288. computer = spares->query().queryProp("@computer");
  289. if (!computer||!*computer) {
  290. WARNLOG("writeSlavesFile: spare has no computer specified");
  291. continue;
  292. }
  293. IpAddress nodeip;
  294. if (!resolveComputerName(rootEnv,computer,nodeip)) {
  295. WARNLOG("writeSlavesFile: cannot resolve spare at %s",computer);
  296. str.append(computer);
  297. }
  298. else
  299. nodeip.getIpText(str);
  300. if (!bNewMachineIsLinux)
  301. str.append('\r'); // not sure a good idea but consistent with deploy engine
  302. str.append('\n');
  303. }
  304. filename.setLength(dirsz);
  305. filename.append(FILES_WRITE_PREFIX "spares");
  306. PROGLOG("Writing spares to %s",filename.str());
  307. outfile.setown(createIFile(filename.str()));
  308. outfileio.setown(outfile->open(IFOcreate));
  309. if (!outfileio)
  310. throw MakeStringException (-1,"Cannot create spares file %s",filename.str());
  311. outfileio->write(0,str.length(),str.str());
  312. }
  313. public:
  314. void doThorSlavesFiles(const char *newip, IPropertyTree* rootEnv, bool bNewMachineIsLinux)
  315. { // recreates DFS Groups (bit over the top for this usage, but effective)
  316. Owned<IPropertyTreeIterator> clusters= rootEnv->getElements("Software/ThorCluster");
  317. ForEach(*clusters) {
  318. IPropertyTree &cluster = clusters->query();
  319. writeSlavesFile(rootEnv, newip, cluster, bNewMachineIsLinux);
  320. }
  321. }
  322. bool doFiles(IPropertyTree* filesRoot, const char* thor, const char* oldip, const char* newip,unsigned partno)
  323. {
  324. class cfilescan
  325. {
  326. void processScopes(IPropertyTree &root,StringBuffer &name)
  327. {
  328. size32_t ns = name.length();
  329. if (ns)
  330. name.append("::");
  331. size32_t ns2 = name.length();
  332. Owned<IPropertyTreeIterator> iter = root.getElements("Scope");
  333. if (iter->first()) {
  334. do {
  335. IPropertyTree &scope = iter->query();
  336. name.append(scope.queryProp("@name"));
  337. processScopes(scope,name);
  338. name.setLength(ns2);
  339. } while (iter->next());
  340. }
  341. processFiles(root,name);
  342. name.setLength(ns);
  343. }
  344. void processFiles(IPropertyTree &root,StringBuffer &name)
  345. {
  346. size32_t ns = name.length();
  347. Owned<IPropertyTreeIterator> iter = root.getElements("File");
  348. if (iter->first()) {
  349. do {
  350. IPropertyTree &file = iter->query();
  351. name.append(file.queryProp("@name"));
  352. processFile(file,name);
  353. name.setLength(ns);
  354. } while (iter->next());
  355. }
  356. }
  357. void processFile(IPropertyTree &file,StringBuffer &name)
  358. {
  359. Owned<IPropertyTreeIterator> iter = file.getElements(frompart.str());
  360. if (iter->first())
  361. {
  362. loop
  363. {
  364. IPropertyTree &item = iter->query();
  365. if (!partno || item.getPropInt("@num",0)==partno) {
  366. PROGLOG("Processing file %s",name.str());
  367. item.setProp("@node",to);
  368. }
  369. else
  370. WARNLOG("ignoring node on file %s parts don't match (%d,%d)",name.str(),item.getPropInt("@num",0),partno);
  371. if (!iter->next())
  372. break;
  373. }
  374. }
  375. }
  376. public:
  377. void scan(IPropertyTree *sroot)
  378. {
  379. StringBuffer name;
  380. processScopes(*sroot,name);
  381. }
  382. StringBuffer frompart;
  383. const char* to;
  384. unsigned partno;
  385. } filescan;
  386. filescan.frompart.append("Part[@node=\"").append(oldip).append("\"]");
  387. filescan.to = newip;
  388. filescan.partno = partno;
  389. filescan.scan(filesRoot);
  390. return true;
  391. }
  392. };
  393. static bool doSingleSwapNode(IRemoteConnection *connEnv,IRemoteConnection *connFiles,const char* cluster,const char* oldip,const char* newip,unsigned nodenum,IPropertyTree *info,const char *timechecked)
  394. {
  395. IPropertyTree* rootEnv = connEnv->queryRoot();
  396. IPropertyTree* rootFiles = connFiles->queryRoot();
  397. bool bNewMachineIsLinux;
  398. if (doEnvironment(rootEnv, cluster,oldip,newip,nodenum, bNewMachineIsLinux)) {
  399. CfixDaliDFS fixdfs;
  400. fixdfs.doFiles(rootFiles, cluster,oldip,newip,nodenum);
  401. // no turning back now
  402. connEnv->commit();
  403. connFiles->commit();
  404. SocketEndpoint ipfrom(oldip);
  405. SocketEndpoint ipto(newip);
  406. queryNamedGroupStore().swapNode(ipfrom,ipto);
  407. fixdfs.doThorSlavesFiles(newip,connEnv->queryRoot(), bNewMachineIsLinux); // must be done after doEnvironment
  408. if (info) {
  409. StringBuffer times(timechecked);
  410. if (times.length()==0) {
  411. CDateTime dt;
  412. dt.setNow();
  413. dt.getString(times);
  414. }
  415. StringBuffer xpath;
  416. // TBD tie up with bad node in auto?
  417. IPropertyTree *swap = info->addPropTree("Swap",createPTree("Swap"));
  418. swap->setProp("@inNetAddress",newip);
  419. swap->setProp("@outNetAddress",oldip);
  420. swap->setProp("@time",times.str());
  421. swap->setPropInt("@rank",nodenum-1);
  422. }
  423. return true;
  424. }
  425. return false;
  426. }
  427. static bool doSwapNode(IPropertyTree *options,bool doswap,const char* cluster,const char* oldip,const char* newip,unsigned nodenum, bool nofail)
  428. {
  429. Owned<IRemoteConnection> connNewEnv; // only used as lock (apparently)
  430. Owned<IRemoteConnection> connEnv;
  431. Owned<IRemoteConnection> connFiles;
  432. try {
  433. const unsigned int mode = RTM_CREATE | RTM_CREATE_QUERY | RTM_LOCK_READ | RTM_LOCK_WRITE | RTM_DELETE_ON_DISCONNECT;
  434. const unsigned int mode2 = RTM_LOCK_READ; // only lock for read as NewEnvironment will protect against configenv
  435. connNewEnv.setown(GetRemoteLock("/NewEnvironment",mode,nofail));
  436. if (!connNewEnv)
  437. return false;
  438. connEnv.setown(GetRemoteLock("/Environment",mode2,nofail));
  439. if (!connEnv)
  440. return false;
  441. if (doswap) {
  442. connFiles.setown(GetRemoteLock("/Files",mode2, nofail));
  443. if (!connFiles)
  444. return false;
  445. }
  446. #ifndef _ESP
  447. if (options) {
  448. doAutoSwapNode(connEnv,connFiles,options,doswap);
  449. autoRestart(options);
  450. }
  451. else
  452. #endif
  453. {
  454. ensureThorIsDown(cluster,false,false);
  455. Owned<IPropertyTree> info;
  456. #ifndef _ESP
  457. Owned<IPropertyTree> opt = createPTree(ipt_caseInsensitive);
  458. opt->setProp("@nodeGroup",cluster);
  459. Owned<IGroup> grp;
  460. Owned<IRemoteConnection> connSwapNode;
  461. StringAttr grpname;
  462. getSwapNodeInfo(opt,grpname,grp,connSwapNode,info,true);
  463. #endif
  464. doSingleSwapNode(connEnv,connFiles,cluster,oldip,newip,nodenum,info,NULL);
  465. }
  466. }
  467. catch (IException *) {
  468. if (connEnv)
  469. connEnv->rollback();
  470. if (connFiles)
  471. connFiles->rollback();
  472. throw;
  473. }
  474. PROGLOG("SwapNode finished");
  475. return true;
  476. }
  477. void SwapNode(const char* cluster,const char* oldip,const char* newip,unsigned nodenum)
  478. {
  479. PROGLOG("SWAPNODE(%s,%s,%s,%d) starting",cluster,oldip,newip,nodenum);
  480. doSwapNode(NULL,true,cluster,oldip,newip,nodenum,false);
  481. }
  482. #ifndef _ESP
  483. bool WuResubmit(const char *wuid)
  484. {
  485. Owned<IWorkUnitFactory> factory = getWorkUnitFactory();
  486. Owned<IWorkUnit> wu = factory->updateWorkUnit(wuid);
  487. if (!wu) {
  488. ERRLOG("WuResubmit(%s): could not find workunit",wuid);
  489. return false;
  490. }
  491. if (wu->getState()!=WUStateFailed) {
  492. SCMStringBuffer state;
  493. wu->getStateDesc(state);
  494. ERRLOG("WuResubmit(%s): could not resubmit as workunit state is '%s'",wuid,state.str());
  495. return false;
  496. }
  497. SCMStringBuffer token;
  498. wu->getSecurityToken(token);
  499. SCMStringBuffer user;
  500. SCMStringBuffer password;
  501. extractToken(token.str(), wuid, user, password);
  502. wu->resetWorkflow();
  503. wu->setState(WUStateSubmitted);
  504. wu->commit();
  505. wu.clear();
  506. submitWorkUnit(wuid,user.str(),password.str());
  507. PROGLOG("WuResubmit(%s): resubmitted",wuid);
  508. return true;
  509. }
  510. void swapNodeHistory(IPropertyTree *options,unsigned days,StringBuffer *out)
  511. {
  512. Owned<IGroup> grp;
  513. Owned<IRemoteConnection> connSwapNode;
  514. Owned<IPropertyTree> info;
  515. StringAttr grpname;
  516. if (!getSwapNodeInfo(options,grpname,grp,connSwapNode,info,true)) {
  517. if (out)
  518. out->append("No swapnode info\n");
  519. else
  520. ERRLOG("No swapnode info");
  521. return;
  522. }
  523. StringBuffer line;
  524. CDateTime tt;
  525. CDateTime cutoff;
  526. if (days) {
  527. cutoff.setNow();
  528. cutoff.adjustTime(-60*24*(int)days);
  529. }
  530. unsigned i=0;
  531. if (out)
  532. out->append("Failure, Time, NodeNum, NodeIp, ErrCode, Error Message\n------------------------------------------------------\n");
  533. else {
  534. PROGLOG("Failure, Time, NodeNum, NodeIp, ErrCode, Error Message");
  535. PROGLOG("------------------------------------------------------");
  536. }
  537. Owned<IPropertyTreeIterator> it1 = info->getElements("BadNode");
  538. ForEach(*it1) {
  539. IPropertyTree &badnode = it1->query();
  540. const char *ts = badnode.queryProp("@time");
  541. if (!ts)
  542. continue;
  543. if (days) {
  544. tt.setString(ts);
  545. if (cutoff.compare(tt)>0)
  546. continue;
  547. }
  548. line.clear().append(++i).append(", ");
  549. line.append(ts).append(", ").append(badnode.getPropInt("@rank",-1)+1).append(", ");
  550. badnode.getProp("@netAddress",line);
  551. line.append(", ").append(badnode.getPropInt("@code")).append(", \"");
  552. badnode.getProp(NULL,line);
  553. line.append('\"');
  554. if (out)
  555. out->append(line).append('\n');
  556. else
  557. PROGLOG("%s",line.str());
  558. }
  559. if (out)
  560. out->append("\nSwapped, Time, NodeNum, OutIp, InIp\n-----------------------------------\n");
  561. else {
  562. PROGLOG("%s", "");
  563. PROGLOG("Swapped, Time, NodeNum, OutIp, InIp");
  564. PROGLOG("-----------------------------------");
  565. }
  566. i = 0;
  567. Owned<IPropertyTreeIterator> it2 = info->getElements("Swap");
  568. ForEach(*it2) {
  569. IPropertyTree &swappednode = it2->query();
  570. const char *ts = swappednode.queryProp("@time");
  571. if (!ts)
  572. continue;
  573. if (days) {
  574. tt.setString(ts);
  575. if (cutoff.compare(tt)>0)
  576. continue;
  577. }
  578. line.clear().append(++i).append(", ");
  579. swappednode.getProp("@time",line);
  580. line.append(", ").append(swappednode.getPropInt("@rank",-1)+1).append(", ");
  581. swappednode.getProp("@outNetAddress",line);
  582. line.append(", ");
  583. swappednode.getProp("@inNetAddress",line);
  584. if (out)
  585. out->append(line.str()).append('\n');
  586. else
  587. PROGLOG("%s",line.str());
  588. }
  589. }
  590. bool checkIfNodeInUse(IPropertyTree *root, IpAddress &ip, bool includespares, StringBuffer &clustname)
  591. {
  592. SocketEndpoint ep(0,ip);
  593. IPropertyTree* pSoftware = root->queryPropTree("Software");
  594. if(!pSoftware)
  595. throw MakeStringException(-1,"Could not find /Environment/Software!");
  596. // look for all ThorCluster entries with correct nodegroup (needed for multithor)
  597. StringBuffer endpoint;
  598. Owned<IPropertyTreeIterator> clusters = pSoftware->getElements("ThorCluster");
  599. StringBuffer xpath;
  600. ForEach(*clusters) {
  601. IPropertyTree &cluster = clusters->query();
  602. const char *groupname = cluster.queryProp("@nodeGroup");
  603. if (!groupname||!*groupname)
  604. groupname = cluster.queryProp("@name");
  605. Owned<IGroup> grp = queryNamedGroupStore().lookup(groupname);
  606. if (!grp) {
  607. ERRLOG("writeSlavesFile: group not found for cluster %s",groupname);
  608. continue;
  609. }
  610. if ((int)grp->rank(ep)>=0) {
  611. clustname.append(groupname);
  612. return true;
  613. }
  614. if (!includespares)
  615. continue;
  616. Owned<IPropertyTreeIterator> spares = cluster.getElements("ThorSpareProcess");
  617. ForEach(*spares) {
  618. const char *computer = spares->query().queryProp("@computer");
  619. if (!computer||!*computer) {
  620. WARNLOG("checkIfNodeInUse: spare has no computer specified");
  621. continue;
  622. }
  623. IpAddress nodeip;
  624. if (!resolveComputerName(root,computer,nodeip)) {
  625. WARNLOG("checkIfNodeInUse: cannot resolve spare at %s",computer);
  626. continue;
  627. }
  628. if (nodeip.ipequals(ip)) {
  629. clustname.append(groupname).append(" spares");
  630. return true;
  631. }
  632. }
  633. }
  634. return false;
  635. }
  636. void swappedList(IPropertyTree *options,unsigned days, StringBuffer *out)
  637. {
  638. Owned<IRemoteConnection> conn = querySDS().connect("/Environment", myProcessSession(), 0, SDS_LOCK_TIMEOUT);
  639. if (!conn)
  640. return;
  641. Owned<IGroup> grp;
  642. Owned<IRemoteConnection> connSwapNode;
  643. Owned<IPropertyTree> info;
  644. StringAttr grpname;
  645. if (!getSwapNodeInfo(options,grpname,grp,connSwapNode,info,true)) { // should put out error if returns false
  646. return;
  647. }
  648. CDateTime tt;
  649. CDateTime cutoff;
  650. if (days) {
  651. cutoff.setNow();
  652. cutoff.adjustTime(-60*24*(int)days);
  653. }
  654. Owned<IPropertyTreeIterator> it2 = info->getElements("Swap");
  655. ForEach(*it2) {
  656. IPropertyTree &swappednode = it2->query();
  657. const char *ts = swappednode.queryProp("@time");
  658. if (!ts)
  659. continue;
  660. if (days) {
  661. tt.setString(ts);
  662. if (cutoff.compare(tt)>0)
  663. continue;
  664. }
  665. const char *ips = swappednode.queryProp("@outNetAddress");
  666. if (!ips||!*ips)
  667. continue;
  668. IpAddress ip(ips);
  669. StringBuffer clustname;
  670. if (checkIfNodeInUse(conn->queryRoot(),ip,true,clustname))
  671. continue; // ignore
  672. if (out)
  673. out->append(ips).append('\n');
  674. else
  675. PROGLOG("%s",ips);
  676. }
  677. }
  678. void EmailSwap(IPropertyTree *options, const char *msg, bool warn=false, bool sendswapped=false, bool sendhistory=false)
  679. {
  680. StringBuffer emailtarget;
  681. StringBuffer smtpserver;
  682. if (options->getProp("SwapNode/@EmailAddress",emailtarget)&&emailtarget.length()&&options->getProp("SwapNode/@EmailSMTPServer",smtpserver)&&smtpserver.length()) {
  683. const char * subject = options->queryProp("SwapNode/@EmailSubject");
  684. if (!subject)
  685. subject = "SWAPNODE automated email";
  686. StringBuffer msgs;
  687. if (!msg) {
  688. StringAttr grpname;
  689. grpname.set(options->queryProp("@nodeGroup"));
  690. if (grpname.isEmpty())
  691. grpname.set(options->queryProp("@name"));
  692. msgs.append("Swapnode command line, Cluster: ");
  693. msg = msgs.append(grpname).append('\n').str();
  694. }
  695. CDateTime dt;
  696. dt.setNow();
  697. StringBuffer out;
  698. dt.getString(out,true).append(": ").append(msg).append("\n\n");
  699. if (options->getPropBool("SwapNode/@EmailSwappedList")||sendswapped) {
  700. out.append("Currently swapped out nodes:\n");
  701. swappedList(options,0,&out);
  702. out.append('\n');
  703. }
  704. if (options->getPropBool("SwapNode/@EmailHistory")||sendhistory) {
  705. out.append("Swap history:\n");
  706. swapNodeHistory(options,0,&out);
  707. out.append('\n');
  708. }
  709. SocketEndpoint ep(smtpserver.str(),25);
  710. StringBuffer sender("swapnode@");
  711. queryHostIP().getIpText(sender);
  712. // add tbd
  713. StringBuffer ips;
  714. StringArray warnings;
  715. sendEmail(emailtarget.str(),subject,out.str(),ep.getIpText(ips).str(),ep.port,sender.str(),&warnings);
  716. ForEachItemIn(i,warnings)
  717. WARNLOG("SWAPNODE: %s",warnings.item(i));
  718. }
  719. else if (warn)
  720. WARNLOG("Either SwapNode/@EmailAddress or SwapNode/@EmailSMTPServer not set in thor.xml");
  721. }
  722. // SwapNode info
  723. //
  724. // SwapNode/
  725. // Thor [ @group, @timeChecked ]
  726. // BadNode [ @netAddress, @timeChecked, @time, @numTimes, @code, @rank, (node value = error message) ]
  727. // Swap [ @inNetAddress, @outNetAddress, @time, @rank]
  728. // WorkUnit [ @id @time @resubmitted ]
  729. //time,nodenum,ip,code,errmsg
  730. //time,nodenum,swapout,swapin
  731. static void autoRestart(IPropertyTree *options)
  732. {
  733. // restarts any workunits that failed near to swap
  734. // let see if need resubmit any nodes
  735. StringArray toresubmit;
  736. if (options->getPropBool("SwapNode/@swapNodeRestartJob")) {
  737. Owned<IGroup> grp;
  738. Owned<IRemoteConnection> connSwapNode;
  739. Owned<IPropertyTree> info;
  740. StringAttr grpname;
  741. if (!getSwapNodeInfo(options,grpname,grp,connSwapNode,info,false)) { // should put out error if returns false
  742. PROGLOG("SWAPNODE(autoRestart) exiting");
  743. return;
  744. }
  745. CDateTime recent;
  746. recent.setNow();
  747. recent.adjustTime(-SWAPNODE_RETRY_TIME/(1000*60));
  748. Owned<IPropertyTreeIterator> it = info->getElements("WorkUnit");
  749. ForEach(*it) {
  750. IPropertyTree &wu = it->query();
  751. const char *wuid = wu.queryProp("@id");
  752. if (!wuid)
  753. continue;
  754. if (!wu.getPropBool("@resubmitted")) {
  755. // see if any swaps recently done
  756. const char *dt1s = wu.queryProp("@time");
  757. if (!dt1s||!*dt1s)
  758. continue;
  759. CDateTime dt1;
  760. dt1.setString(dt1s);
  761. dt1.adjustTime(SWAPNODE_RETRY_TIME/(1000*60));
  762. Owned<IPropertyTreeIterator> swit = info->getElements("Swap");
  763. ForEach(*swit) {
  764. IPropertyTree &swap = swit->query();
  765. const char *dt2s = swap.queryProp("@time");
  766. if (!dt2s||!*dt2s)
  767. continue;
  768. CDateTime dt2;
  769. dt2.setString(dt2s);
  770. if ((dt2.compare(recent)>0)&&(dt1.compare(dt2)>0)) {
  771. wu.setPropBool("@resubmitted",true); // only one attempt
  772. toresubmit.append(wuid);
  773. break;
  774. }
  775. }
  776. }
  777. }
  778. }
  779. ForEachItemIn(ir,toresubmit) {
  780. WuResubmit(toresubmit.item(ir));
  781. }
  782. }
  783. static void doAutoSwapNode(IRemoteConnection *connEnv,IRemoteConnection *connFiles,IPropertyTree *options,bool doswap)
  784. {
  785. if (!checkThorNodeSwap(options,NULL,doswap?5:0)) {
  786. PROGLOG("No bad nodes detected");
  787. PROGLOG("SWAPNODE(auto) exiting");
  788. return;
  789. }
  790. Owned<IGroup> grp;
  791. Owned<IRemoteConnection> connSwapNode;
  792. Owned<IPropertyTree> info;
  793. StringAttr grpname;
  794. if (!getSwapNodeInfo(options,grpname,grp,connSwapNode,info,false)) { // should put out error if returns false
  795. PROGLOG("SWAPNODE(auto) exiting");
  796. return;
  797. }
  798. StringBuffer ts;
  799. if (!info->getProp("@timeChecked",ts)) {
  800. PROGLOG("SWAPNODE(auto): no check information generated");
  801. return;
  802. }
  803. // enumerate bad nodes
  804. StringBuffer xpath;
  805. xpath.appendf("BadNode[@time=\"%s\"]",ts.str());
  806. Owned<IPropertyTreeIterator> it = info->getElements(xpath.str());
  807. SocketEndpointArray epa1;
  808. ForEach(*it) {
  809. IPropertyTree &badnode = it->query();
  810. const char *ip = badnode.queryProp("@netAddress");
  811. if (!ip)
  812. continue;
  813. SocketEndpoint ep(ip);
  814. ep.port = getDaliServixPort();
  815. epa1.append(ep);
  816. }
  817. // recheck
  818. SocketEndpointArray badepa;
  819. UnsignedArray failedcodes;
  820. StringArray failedmessages;
  821. unsigned start = msTick();
  822. validateNodes(epa1,options->getPropBool("SwapNode/@swapNodeCheckC",true),options->getPropBool("SwapNode/@swapNodeCheckD",false),false,options->queryProp("SwapNode/@swapNodeCheckScript"),options->getPropInt("SwapNode/@swapNodeCheckScriptTimeout")*1000,badepa,failedcodes,failedmessages);
  823. if (!badepa.ordinality()) {
  824. PROGLOG("SWAPNODE: on recheck all bad nodes passed (%s,%s)",grpname.get(),ts.str());
  825. return;
  826. }
  827. CDateTime dt;
  828. dt.setNow();
  829. dt.getString(ts.clear());
  830. bool abort=false;
  831. UnsignedArray badrank;
  832. ForEachItemIn(i1,badepa) {
  833. SocketEndpoint ep(badepa.item(i1));
  834. ep.port = 0; // should be no ports in group
  835. StringBuffer ips;
  836. ep.getIpText(ips);
  837. xpath.clear().appendf("BadNode[@netAddress=\"%s\"]",ips.str());
  838. IPropertyTree *bnt = info->queryPropTree(xpath.str());
  839. if (!bnt) {
  840. ERRLOG("SWAPNODE node %s not found in swapnode info!",ips.str());
  841. return;
  842. }
  843. bnt->setProp("@time",ts.str());
  844. int r = bnt->getPropInt("@rank",-1);
  845. if ((int)r<0) { // shouldn't occur
  846. ERRLOG("SWAPNODE node %s rank not found in group %s",ips.str(),grpname.get());
  847. return;
  848. }
  849. badrank.append((unsigned)r);
  850. for (unsigned j1=0;j1<i1;j1++) {
  851. SocketEndpoint ep1(badepa.item(j1));
  852. ep1.port = 0; // should be no ports in group
  853. int r1 = (int)badrank.item(j1);
  854. if ((r==(r1+1)%grp->ordinality())||
  855. (r1==(r+1)%grp->ordinality())) {
  856. StringBuffer ips1;
  857. ep1.getIpText(ips1);
  858. ERRLOG("SWAPNODE adjacent nodes %d (%s) and %d (%s) are bad!",r+1,ips.str(),r1+1,ips1.str());
  859. abort = true;
  860. }
  861. }
  862. }
  863. // now see if any of bad nodes have been swapped out recently
  864. CDateTime recent = dt;
  865. int snint = options->getPropInt("SwapNode/@swapNodeInterval",24);
  866. recent.adjustTime(-60*snint);
  867. it.setown(info->getElements("Swap"));
  868. ForEach(*it) {
  869. IPropertyTree &swappednode = it->query();
  870. CDateTime dt1;
  871. const char *dt1s = swappednode.queryProp("@time");
  872. if (!dt1s||!*dt1s)
  873. continue;
  874. dt1.setString(dt1s);
  875. if (dt1.compare(recent)<0)
  876. continue;
  877. const char *ips = swappednode.queryProp("@outNetAddress");
  878. if (!ips||!*ips)
  879. continue;
  880. int r1 = swappednode.getPropInt("@rank",-1);
  881. SocketEndpoint swappedep(ips);
  882. swappedep.port = 0;
  883. ForEachItemIn(i2,badepa) {
  884. SocketEndpoint badep(badepa.item(i2));
  885. int badr = (int)badrank.item(i2);
  886. badep.port = 0;
  887. if (swappedep.equals(badep)) {
  888. // not sure if *really* want this
  889. ERRLOG("Node %d (%s) was swapped out on %s (too recent)",badr+1,ips,dt1s);
  890. abort = true;
  891. }
  892. else if ((badr==(r1+1)%grp->ordinality())||
  893. (r1==(badr+1)%grp->ordinality())) {
  894. StringBuffer bs;
  895. ERRLOG("SWAPNODE adjacent node to bad node %d (%s), %d (%s) was swapped on %s (too recent) !",badr+1,badep.getIpText(bs).str(),r1+1,ips,dt1s);
  896. abort = true;
  897. }
  898. }
  899. }
  900. const char *intent = doswap?"will":"would";
  901. // find spares
  902. IPropertyTree* rootEnv = connEnv->queryRoot();
  903. SocketEndpointArray spareepa;
  904. StringArray swapfrom;
  905. StringArray swapto;
  906. if (!abort) {
  907. Owned<IPropertyTreeIterator> clusters = connEnv->queryRoot()->getElements("Software/ThorCluster");
  908. ForEach(*clusters) {
  909. IPropertyTree &cluster = clusters->query();
  910. const char *cname = cluster.queryProp("@nodeGroup");
  911. if (!cname||!*cname)
  912. cname = cluster.queryProp("@name");
  913. if (strcmp(grpname.get(),cname)!=0)
  914. continue;
  915. Owned<IPropertyTreeIterator> spares = cluster.getElements("ThorSpareProcess");
  916. ForEach(*spares) {
  917. const char *computer = spares->query().queryProp("@computer");
  918. if (!computer||!*computer) {
  919. WARNLOG("SWAPNODE: spare has no computer specified");
  920. continue;
  921. }
  922. SocketEndpoint nodeep;
  923. if (!resolveComputerName(rootEnv,computer,nodeep)) {
  924. WARNLOG("SWAPNODE: cannot resolve spare at %s",computer);
  925. continue;
  926. }
  927. nodeep.port = 0;
  928. bool found = false;
  929. ForEachItemIn(j1,spareepa) {
  930. if (spareepa.item(j1).ipequals(nodeep)) {
  931. found = true;
  932. break;
  933. }
  934. }
  935. if (!found)
  936. spareepa.append(nodeep);
  937. }
  938. }
  939. ForEachItemIn(i3,badepa) {
  940. StringBuffer from;
  941. badepa.item(i3).getIpText(from);
  942. if (i3<spareepa.ordinality()) {
  943. StringBuffer to;
  944. spareepa.item(i3).getIpText(to);
  945. PROGLOG("SWAPNODE %s swap node %d from %s to %s",intent,badrank.item(i3)+1,from.str(),to.str());
  946. }
  947. else {
  948. abort = true;
  949. ERRLOG("SWAPNODE no spare available to swap for node %d (%s)",badrank.item(i3)+1,from.str());
  950. }
  951. }
  952. }
  953. // now list what can do
  954. if (abort) {
  955. ERRLOG("SWAPNODE: problems found (listed above), no swap %s be attempted",intent);
  956. return;
  957. }
  958. if (!doswap)
  959. return;
  960. // need to release swapnode lock for multi thor not to get deadlocked
  961. connSwapNode.clear();
  962. ensureThorIsDown(grpname,true,true);
  963. ForEachItemIn(i4,badepa) {
  964. StringBuffer from;
  965. badepa.item(i4).getIpText(from);
  966. StringBuffer to;
  967. spareepa.item(i4).getIpText(to);
  968. if (doSingleSwapNode(connEnv,connFiles,grpname,from.str(),to.str(),badrank.item(i4)+1,info,ts.str())) {
  969. StringBuffer msg;
  970. msg.appendf("AUTOSWAPNODE: cluster %s node %d: swapped out %s, swapped in %s",grpname.get(),badrank.item(i4)+1,from.str(),to.str());
  971. EmailSwap(options,msg.str());
  972. FLLOG(MCoperatorError, swapnodeJob, "%s", msg.str());
  973. }
  974. }
  975. return;
  976. }
  977. void autoSwapNode(IPropertyTree *options,bool doswap)
  978. {
  979. PROGLOG("SWAPNODE(auto%s) starting",doswap?",swap":"");
  980. unsigned start = msTick();
  981. loop {
  982. if (doSwapNode(options,doswap,NULL,NULL,NULL,0,true))
  983. break;
  984. if (msTick()-start>SWAPNODE_RETRY_TIME) {
  985. ERRLOG("Retry time exceeded, exiting");
  986. break;
  987. }
  988. WARNLOG("Swapnode pausing before retry");
  989. Sleep(60+(getRandom()%60));
  990. }
  991. }
  992. struct DaliClient
  993. {
  994. DaliClient(const char* daliserver): serverGroup(createIGroup(daliserver, DALI_SERVER_PORT))
  995. {
  996. if (!serverGroup)
  997. throw MakeStringException(0, "Could not instantiate IGroup");
  998. if (!initClientProcess(serverGroup,DCR_Util))
  999. throw MakeStringException(0, "Could not initializing client process");
  1000. setPasswordsFromSDS();
  1001. closeEnvironment();
  1002. }
  1003. ~DaliClient()
  1004. {
  1005. clearPasswordsFromSDS();
  1006. closedownClientProcess();
  1007. }
  1008. Owned<IGroup> serverGroup;
  1009. };
  1010. void suppressStdOut(bool suppress=true)
  1011. {
  1012. static HANDLE out;
  1013. static HANDLE saveout;
  1014. #ifdef WIN32
  1015. if (suppress) {
  1016. saveout = GetStdHandle(STD_OUTPUT_HANDLE);
  1017. out = ::CreateFile("nul",GENERIC_WRITE,0,NULL,CREATE_ALWAYS,FILE_FLAG_WRITE_THROUGH,NULL);
  1018. SetStdHandle(STD_OUTPUT_HANDLE,out);
  1019. }
  1020. else {
  1021. SetStdHandle(STD_OUTPUT_HANDLE,saveout);
  1022. CloseHandle(out);
  1023. }
  1024. #else
  1025. saveout = fileno(stdout);
  1026. #endif
  1027. }
  1028. int main(int argc,char** argv)
  1029. {
  1030. InitModuleObjects();
  1031. int ret = 0;
  1032. bool isauto = (argc>=2)&&(stricmp(argv[1],"auto")==0);
  1033. bool ishistory = (argc>=2)&&(stricmp(argv[1],"history")==0);
  1034. bool isswapped = (argc>=2)&&(stricmp(argv[1],"swapped")==0);
  1035. bool isemail = (argc>=2)&&(stricmp(argv[1],"email")==0);
  1036. if ((argc<5)&&!isauto&&!ishistory&&!isswapped&&!isemail) {
  1037. fprintf(stderr,"Usage: swapnode <daliserver> <thor-cluster> <oldip> <newip> <nodenum>\n");
  1038. fprintf(stderr," or: swapnode history [<days>] -- list swap history \n");
  1039. fprintf(stderr," or: swapnode history [<days>] 2> outfile.csv -- save swap history \n");
  1040. fprintf(stderr," or: swapnode swapped [<days>] -- list currently swapped nodes\n");
  1041. fprintf(stderr," or: swapnode email -- tests email\n");
  1042. fprintf(stderr," or: swapnode auto [swap]\n");
  1043. fprintf(stderr,"NB auto,history,swapped and email must be run in a thor deploy directory \n");
  1044. fprintf(stderr," (e.g. /c$/thor) or in a directory with copy of thor.xml\n");
  1045. fprintf(stderr,"if 'swap' not specified after 'auto' then only displays what *would* be swapped\n");
  1046. ret = 2;
  1047. }
  1048. else {
  1049. try {
  1050. const char* daliserver;
  1051. Owned<IPropertyTree> options;
  1052. if (isauto||ishistory|isswapped|isemail) {
  1053. options.setown(createPTreeFromXMLFile("thor.xml", ipt_caseInsensitive));
  1054. daliserver = options?options->queryProp("@daliServers"):NULL;
  1055. if (!daliserver||!*daliserver)
  1056. throw MakeStringException(-1,"Either thor.xml not found or DALISERVERS not found in thor.xml");
  1057. }
  1058. else {
  1059. options.setown(createPTree(ipt_caseInsensitive)); // don't use thor.xml
  1060. daliserver = argv[1];
  1061. }
  1062. DaliClient dclient(daliserver);
  1063. StringBuffer logname;
  1064. splitFilename(argv[0], NULL, NULL, &logname, NULL);
  1065. addFileTimestamp(logname, true);
  1066. logname.append(".log");
  1067. StringBuffer lf;
  1068. openLogFile(lf, logname.str(),0,false,true);
  1069. queryStderrLogMsgHandler()->setMessageFields(MSGFIELD_prefix);
  1070. if (options&&options->getPropBool("@enableSysLog",true))
  1071. UseSysLogForOperatorMessages();
  1072. if (argc>=5) {
  1073. //DOM- Moved logic to swapnodemain.cpp so I use it within the management console...
  1074. const char* thor=argv[2];
  1075. const char* oldip=argv[3];
  1076. const char* newip=argv[4];
  1077. unsigned nodenum=(argc>5)?atoi(argv[5]):0;
  1078. SwapNode(thor,oldip,newip,nodenum);
  1079. }
  1080. else if (isauto)
  1081. autoSwapNode(options,(argc>2)&&(stricmp(argv[2],"swap")==0));
  1082. else if (ishistory)
  1083. swapNodeHistory(options,(argc>2)?atoi(argv[2]):0,NULL);
  1084. else if (isswapped)
  1085. swappedList(options,(argc>2)?atoi(argv[2]):0,NULL);
  1086. else if (isemail) {
  1087. bool sendswapped = (argc>=3)&&(stricmp(argv[2],"swapped")==0);
  1088. bool sendhistory = (argc>=3)&&(stricmp(argv[2],"history")==0);
  1089. EmailSwap(options, NULL,true, sendswapped,sendhistory);
  1090. }
  1091. }
  1092. catch (IException *e) {
  1093. EXCLOG(e,"SWAPNODE");
  1094. e->Release();
  1095. ret = -1;
  1096. }
  1097. }
  1098. UseSysLogForOperatorMessages(false);
  1099. ExitModuleObjects();
  1100. return ret;
  1101. }
  1102. #endif