浏览代码

Merge pull request #36 from jeng1220/bugfix_cuda_c

Improve solution for CUDA C
Bharatkumar Sharma 3 年之前
父节点
当前提交
c4787d6e5f

+ 197 - 230
hpc/nways/nways_labs/nways_MD/English/C/source_code/cudac/SOLUTION/rdf_malloc.cu

@@ -1,4 +1,3 @@
-// Copyright (c) 2021 NVIDIA Corporation.  All rights reserved. 
 #include <stdio.h>
 #include <iostream>
 #include <fstream>
@@ -13,242 +12,210 @@
 
 using namespace std;
 //additional error handling code
-static void HandleError( cudaError_t err,
-		const char *file,
-		int line ) {
-	if (err != cudaSuccess) {
-		printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
-				file, line );
-		exit( EXIT_FAILURE );
-	}
+static void HandleError(cudaError_t err,
+    const char *file, int line) {
+    if (err != cudaSuccess) {
+        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
+        file, line );
+        exit( EXIT_FAILURE );
+   }
 }
 #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 
 //declaration of GPU function
-__global__ void pair_gpu(const double* d_x, const double* d_y, const double* d_z,  unsigned long long int *d_g2, int numatm, int nconf, 
-		const double xbox, const double ybox, const double zbox,  int d_bin,  unsigned long long int bl);
+__global__ void pair_gpu(const double* d_x, const double* d_y, const double* d_z,
+                         unsigned long long int *d_g2, int numatm, int nconf, 
+                         double xbox, double ybox, double zbox, int d_bin);
 
-int main(int argc , char* argv[] )
+int main(int argc , char* argv[])
 {
-	double xbox,ybox,zbox;
-	double* h_x,*h_y,*h_z;
-	double* d_x,*d_y,*d_z;
-	unsigned long long int *h_g2,*d_g2;
-	int nbin;
-	int nthreads,device;
-	int numatm,nconf,inconf;
-	unsigned long long int near2;
-	string file;
-
-	///////////////////////////////////////////////////////////////
-
-	inconf = 10;
-	nbin=2000;
-	file = "../input/alk.traj.dcd";
-	device = 0;
-	nthreads = 128;
-	HANDLE_ERROR (cudaSetDevice(device));//pick the device to use
-	///////////////////////////////////////
-	std::ifstream infile;
-	infile.open(file.c_str());
-	if(!infile){
-		cout<<"file "<<file.c_str()<<" not found\n";
-		return 1;
-	}
-	assert(infile);
-
-
-	ofstream pairfile,stwo;
-	pairfile.open("RDF.dat");
-	stwo.open("Pair_entropy.dat");
-
-	/////////////////////////////////////////////////////////
-	dcdreadhead(&numatm,&nconf,infile);
-	cout<<"Dcd file has "<< numatm << " atoms and " << nconf << " frames"<<endl;
-	if (inconf>nconf) cout << "nconf is reset to "<< nconf <<endl;
-	else
-	{nconf=inconf;}
-	cout<<"Calculating RDF for " << nconf << " frames"<<endl;
-	////////////////////////////////////////////////////////
-
-	unsigned long long int sizef= nconf*numatm*sizeof(double);
-	unsigned long long int sizebin= nbin*sizeof(unsigned long long int);
-
-	//Allocate memory on CPU
-	HANDLE_ERROR(cudaHostAlloc((void **)&h_x, sizef, cudaHostAllocDefault));
-	HANDLE_ERROR(cudaHostAlloc((void **)&h_y, sizef, cudaHostAllocDefault));
-	HANDLE_ERROR(cudaHostAlloc((void **)&h_z, sizef, cudaHostAllocDefault));
-	HANDLE_ERROR(cudaHostAlloc((void **)&h_g2, sizebin, cudaHostAllocDefault));
-
-	//Allocate memory on GPU
-	HANDLE_ERROR(cudaMalloc((void**)&d_x, sizef));
-	HANDLE_ERROR(cudaMalloc((void**)&d_y, sizef));
-	HANDLE_ERROR(cudaMalloc((void**)&d_z, sizef));
-	HANDLE_ERROR(cudaMalloc((void**)&d_g2, sizebin));
-
-	HANDLE_ERROR (cudaPeekAtLastError());
-
-	memset(h_g2,0,sizebin);
-
-
-
-	/////////reading cordinates//////////////////////////////////////////////
-	nvtxRangePush("Read_File");
-	double ax[numatm],ay[numatm],az[numatm];
-	for (int i=0;i<nconf;i++) {
-		dcdreadframe(ax,ay,az,infile,numatm,xbox,ybox,zbox);
-		for (int j=0;j<numatm;j++){
-			h_x[i*numatm+j]=ax[j];
-			h_y[i*numatm+j]=ay[j];
-			h_z[i*numatm+j]=az[j];
-		}
-	}
-	nvtxRangePop(); //pop for REading file
-
-
-	nvtxRangePush("Pair_Calculation");
-	//Copy the data from Host to Device before calculation on GPU
-	HANDLE_ERROR(cudaMemcpy(d_g2, h_g2, sizebin,cudaMemcpyHostToDevice));
-	HANDLE_ERROR(cudaMemcpy(d_x, h_x, sizef, cudaMemcpyHostToDevice));
-	HANDLE_ERROR(cudaMemcpy(d_y, h_y, sizef, cudaMemcpyHostToDevice));
-	HANDLE_ERROR(cudaMemcpy(d_z, h_z, sizef, cudaMemcpyHostToDevice));
-
-	cout<<"Reading of input file and transfer to gpu is completed"<<endl;
-	//////////////////////////////////////////////////////////////////////////
-
-	near2=nthreads*(int(0.5*numatm*(numatm-1)/nthreads)+1);
-	unsigned long long int nblock = (near2/nthreads);
-
-	cout<<"Initial blocks are "<<nblock<<" "<<", now changing to ";
-
-	int maxblock=65535;
-	int bl;
-	int blockloop= int(nblock/maxblock);
-	if (blockloop != 0) {
-		nblock=maxblock;
-	}
-	cout<<nblock<<" and will run over "<<(blockloop+1)<<" blockloops"<<endl;
-
-	for (bl=0;bl<(blockloop+1);bl++) {
-		//cout <<bl<<endl;
-		pair_gpu<<< nblock,nthreads >>>
-			(d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, nbin, bl);
-
-		HANDLE_ERROR (cudaPeekAtLastError());
-		HANDLE_ERROR(cudaDeviceSynchronize());
-	}
-
-	HANDLE_ERROR(cudaMemcpy(h_g2, d_g2, sizebin, cudaMemcpyDeviceToHost));
-
-	nvtxRangePop(); //Pop for Pair Calculation
-
-	double pi=acos(-1.0l);
-	double rho=(numatm)/(xbox*ybox*zbox);
-	double norm=(4.0l*pi*rho)/3.0l;
-	double rl,ru,nideal;
-	double g2[nbin];
-	double r,gr,lngr,lngrbond,s2=0.0l,s2bond=0.0l;
-	double box=min(xbox,ybox);
-	box=min(box,zbox);
-	double del=box/(2.0l*nbin);
-	nvtxRangePush("Entropy_Calculation");
-	for (int i=0;i<nbin;i++) {
-		//      cout<<i+1<<" "<<h_g2[i]<<endl;
-		rl=(i)*del;
-		ru=rl+del;
-		nideal=norm*(ru*ru*ru-rl*rl*rl);
-		g2[i]=(double)h_g2[i]/((double)nconf*(double)numatm*nideal);
-		r=(i)*del;
-		pairfile<<(i+0.5l)*del<<" "<<g2[i]<<endl;
-		if (r<2.0l) {
-			gr=0.0l;
-		}
-		else {
-			gr=g2[i];
-		}
-		if (gr<1e-5) {
-			lngr=0.0l;
-		}
-		else {
-			lngr=log(gr);
-		}
-
-		if (g2[i]<1e-6) {
-			lngrbond=0.0l;
-		}
-		else {
-			lngrbond=log(g2[i]);
-		}
-		s2=s2-2.0l*pi*rho*((gr*lngr)-gr+1.0l)*del*r*r;
-		s2bond=s2bond-2.0l*pi*rho*((g2[i]*lngrbond)-g2[i]+1.0l)*del*r*r;
-
-	}
-	nvtxRangePop(); //Pop for Entropy Calculation
-	stwo<<"s2 value is "<<s2<<endl;
-	stwo<<"s2bond value is "<<s2bond<<endl;
-
-
-
-	cout<<"\n\n\n#Freeing Device memory"<<endl;
-	HANDLE_ERROR(cudaFree(d_x));
-	HANDLE_ERROR(cudaFree(d_y));
-	HANDLE_ERROR(cudaFree(d_z));
-	HANDLE_ERROR(cudaFree(d_g2));
-
-	cout<<"#Freeing Host memory"<<endl;
-	HANDLE_ERROR(cudaFreeHost ( h_x ) );
-	HANDLE_ERROR(cudaFreeHost ( h_y ) );
-	HANDLE_ERROR(cudaFreeHost ( h_z ) );
-	HANDLE_ERROR(cudaFreeHost ( h_g2 ) );
-
-	cout<<"#Number of atoms processed: "<<numatm<<endl<<endl;
-	cout<<"#Number of confs processed: "<<nconf<<endl<<endl;
-	cout<<"#number of threads used: "<<nthreads<<endl<<endl;
-	return 0;
+    double xbox,ybox,zbox;
+    double* h_x,*h_y,*h_z;
+    double* d_x,*d_y,*d_z;
+    unsigned long long int *h_g2,*d_g2;
+    int nbin;
+    int device;
+    int numatm,nconf,inconf;
+    string file;
+
+    ///////////////////////////////////////////////////////////////
+
+    inconf = 10;
+    nbin = 2000;
+    file = "../input/alk.traj.dcd";
+    device = 0;
+    HANDLE_ERROR (cudaSetDevice(device));//pick the device to use
+    ///////////////////////////////////////
+    std::ifstream infile;
+    infile.open(file.c_str());
+    if(!infile){
+        cout<<"file "<<file.c_str()<<" not found\n";
+        return 1;
+    }
+    assert(infile);
+
+
+    ofstream pairfile,stwo;
+    pairfile.open("RDF.dat");
+    stwo.open("Pair_entropy.dat");
+
+    /////////////////////////////////////////////////////////
+    dcdreadhead(&numatm,&nconf,infile);
+    cout<<"Dcd file has "<< numatm << " atoms and " << nconf << " frames"<<endl;
+    if (inconf>nconf) cout << "nconf is reset to "<< nconf <<endl;
+    else {nconf = inconf;}
+    cout<<"Calculating RDF for " << nconf << " frames"<<endl;
+    ////////////////////////////////////////////////////////
+
+    unsigned long long int sizef= nconf*numatm*sizeof(double);
+    unsigned long long int sizebin= nbin*sizeof(unsigned long long int);
+
+    //Allocate memory on CPU
+    HANDLE_ERROR(cudaHostAlloc((void **)&h_x, sizef, cudaHostAllocDefault));
+    HANDLE_ERROR(cudaHostAlloc((void **)&h_y, sizef, cudaHostAllocDefault));
+    HANDLE_ERROR(cudaHostAlloc((void **)&h_z, sizef, cudaHostAllocDefault));
+    HANDLE_ERROR(cudaHostAlloc((void **)&h_g2, sizebin, cudaHostAllocDefault));
+
+    //Allocate memory on GPU
+    HANDLE_ERROR(cudaMalloc((void**)&d_x, sizef));
+    HANDLE_ERROR(cudaMalloc((void**)&d_y, sizef));
+    HANDLE_ERROR(cudaMalloc((void**)&d_z, sizef));
+    HANDLE_ERROR(cudaMalloc((void**)&d_g2, sizebin));
+
+    HANDLE_ERROR (cudaPeekAtLastError());
+
+    memset(h_g2,0,sizebin);
+
+    /////////reading cordinates//////////////////////////////////////////////
+    nvtxRangePush("Read_File");
+    double ax[numatm],ay[numatm],az[numatm];
+    for (int i=0;i<nconf;i++) {
+        dcdreadframe(ax,ay,az,infile,numatm,xbox,ybox,zbox);
+        for (int j=0;j<numatm;j++){
+            h_x[i*numatm+j]=ax[j];
+            h_y[i*numatm+j]=ay[j];
+            h_z[i*numatm+j]=az[j];
+        }
+    }
+    nvtxRangePop(); //pop for Reading file
+
+    nvtxRangePush("Pair_Calculation");
+    //Copy the data from Host to Device before calculation on GPU
+    HANDLE_ERROR(cudaMemcpy(d_g2, h_g2, sizebin,cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(d_x, h_x, sizef, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(d_y, h_y, sizef, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(d_z, h_z, sizef, cudaMemcpyHostToDevice));
+
+    cout<<"Reading of input file and transfer to gpu is completed"<<endl;
+    //////////////////////////////////////////////////////////////////////////
+    dim3 nthreads(128, 1, 1);
+    dim3 nblock;
+    nblock.x = (numatm + nthreads.x - 1)/nthreads.x;
+    nblock.y = (numatm + nthreads.y - 1)/nthreads.y;
+    nblock.z = 1;
+    pair_gpu<<<nblock, nthreads>>>
+        (d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, nbin);
+
+    HANDLE_ERROR (cudaPeekAtLastError());
+    HANDLE_ERROR(cudaDeviceSynchronize());
+
+    HANDLE_ERROR(cudaMemcpy(h_g2, d_g2, sizebin, cudaMemcpyDeviceToHost));
+
+    nvtxRangePop(); //Pop for Pair Calculation
+
+    double pi=acos(-1.0l);
+    double rho=(numatm)/(xbox*ybox*zbox);
+    double norm=(4.0l*pi*rho)/3.0l;
+    double rl,ru,nideal;
+    double g2[nbin];
+    double r,gr,lngr,lngrbond,s2=0.0l,s2bond=0.0l;
+    double box=min(xbox,ybox);
+    box=min(box,zbox);
+    double del=box/(2.0l*nbin);
+    nvtxRangePush("Entropy_Calculation");
+    for (int i=0;i<nbin;i++) {
+        rl=(i)*del;
+        ru=rl+del;
+        nideal=norm*(ru*ru*ru-rl*rl*rl);
+        g2[i]=(double)h_g2[i]/((double)nconf*(double)numatm*nideal);
+        r=(i)*del;
+        pairfile<<(i+0.5l)*del<<" "<<g2[i]<<endl;
+        if (r<2.0l) {
+            gr=0.0l;
+        }
+        else {
+            gr=g2[i];
+        }
+        if (gr<1e-5) {
+            lngr=0.0l;
+        }
+        else {
+            lngr=log(gr);
+        }
+
+        if (g2[i]<1e-6) {
+            lngrbond=0.0l;
+        }
+        else {
+            lngrbond=log(g2[i]);
+        }
+        s2=s2-2.0l*pi*rho*((gr*lngr)-gr+1.0l)*del*r*r;
+        s2bond=s2bond-2.0l*pi*rho*((g2[i]*lngrbond)-g2[i]+1.0l)*del*r*r;
+
+    }
+    nvtxRangePop(); //Pop for Entropy Calculation
+    stwo<<"s2 value is "<<s2<<endl;
+    stwo<<"s2bond value is "<<s2bond<<endl;
+
+    cout<<"\n\n\n#Freeing Device memory"<<endl;
+    HANDLE_ERROR(cudaFree(d_x));
+    HANDLE_ERROR(cudaFree(d_y));
+    HANDLE_ERROR(cudaFree(d_z));
+    HANDLE_ERROR(cudaFree(d_g2));
+ 
+    cout<<"#Freeing Host memory"<<endl;
+    HANDLE_ERROR(cudaFreeHost(h_x));
+    HANDLE_ERROR(cudaFreeHost(h_y));
+    HANDLE_ERROR(cudaFreeHost(h_z));
+    HANDLE_ERROR(cudaFreeHost(h_g2));
+
+    cout<<"#Number of atoms processed: "<<numatm<<endl<<endl;
+    cout<<"#Number of confs processed: "<<nconf<<endl<<endl;
+    return 0;
 }
 
-__global__ void pair_gpu(
-		const double* d_x, const double* d_y, const double* d_z, 
-		unsigned long long int *d_g2, int numatm, int nconf, 
-		const double xbox,const double ybox,const double zbox,int d_bin,  unsigned long long int bl)
+__global__ void pair_gpu(const double* d_x, const double* d_y, const double* d_z,
+    unsigned long long int *d_g2, int numatm, int nconf, 
+    double xbox, double ybox, double zbox, int d_bin)
 {
-	double r,cut,dx,dy,dz;
-	int ig2,id1,id2;
-	double box;
-	box=min(xbox,ybox);
-	box=min(box,zbox);
-
-	double del=box/(2.0*d_bin);
-	cut=box*0.5;
-	int thisi;
-	double n;
-
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	int maxi = min(int(0.5*numatm*(numatm-1)-(bl*65535*128)),(65535*128));
-
-	if ( i < maxi ) {
-		thisi=bl*65535*128+i;
-
-		n=(0.5)*(1+ ((double) sqrt (1.0+4.0*2.0*thisi)));
-		id1=int(n);
-		id2=thisi-(0.5*id1*(id1-1));
-
-		for (int frame=0;frame<nconf;frame++){
-			dx=d_x[frame*numatm+id1]-d_x[frame*numatm+id2];
-			dy=d_y[frame*numatm+id1]-d_y[frame*numatm+id2];
-			dz=d_z[frame*numatm+id1]-d_z[frame*numatm+id2];
-
-			dx=dx-xbox*(round(dx/xbox));
-			dy=dy-ybox*(round(dy/ybox));
-			dz=dz-zbox*(round(dz/zbox));
-
-			r=sqrtf(dx*dx+dy*dy+dz*dz);
-			if (r<cut) {
-				ig2=(int)(r/del);
-				atomicAdd(&d_g2[ig2],2) ;
-			}
-		}
-	}
+    double r, cut, dx, dy, dz;
+    int ig2;
+    double box;
+    box = min(xbox, ybox);
+    box = min(box, zbox);
+
+    double del = box / (2.0 * d_bin);
+    cut = box * 0.5;
+
+    int id1 = blockIdx.y * blockDim.y + threadIdx.y;
+    int id2 = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (id1 >= numatm || id2 >= numatm) return;
+    if (id1 > id2) return;
+
+    for (int frame = 0; frame < nconf; ++frame) {
+        dx = d_x[frame * numatm + id1] - d_x[frame * numatm + id2];
+        dy = d_y[frame * numatm + id1] - d_y[frame * numatm + id2];
+        dz = d_z[frame * numatm + id1] - d_z[frame * numatm + id2];
+
+        dx = dx - xbox * (round(dx / xbox));
+        dy = dy - ybox * (round(dy / ybox));
+        dz = dz - zbox * (round(dz / zbox));
+
+        r = sqrtf(dx * dx + dy * dy + dz * dz);
+        if (r < cut) {
+            ig2 = (int)(r / del);
+            atomicAdd(&d_g2[ig2], 2);
+        }
+    }
 }
-
-

+ 179 - 205
hpc/nways/nways_labs/nways_MD/English/C/source_code/cudac/SOLUTION/rdf_unified_memory.cu

@@ -12,218 +12,192 @@
 
 using namespace std;
 //additional error handling code
-static void HandleError( cudaError_t err,
-		const char *file,
-		int line ) {
-	if (err != cudaSuccess) {
-		printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
-				file, line );
-		exit( EXIT_FAILURE );
-	}
+static void HandleError(cudaError_t err,
+    const char *file, int line) {
+    if (err != cudaSuccess) {
+        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
+        file, line );
+        exit( EXIT_FAILURE );
+   }
 }
 #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 
 //declaration of GPU function
-__global__ void pair_gpu(const double* d_x, const double* d_y, const double* d_z,  unsigned long long int *d_g2, int numatm, int nconf, 
-		const double xbox, const double ybox, const double zbox,  int d_bin,  unsigned long long int bl);
+__global__ void pair_gpu(const double* d_x, const double* d_y, const double* d_z,
+                         unsigned long long int *d_g2, int numatm, int nconf, 
+                         double xbox, double ybox, double zbox, int d_bin);
 
-int main(int argc , char* argv[] )
+int main(int argc , char* argv[])
 {
-	double xbox,ybox,zbox;
-	double* d_x,*d_y,*d_z;
-	unsigned long long int *d_g2;
-	int nbin;
-	int nthreads,device;
-	int numatm,nconf,inconf;
-	unsigned long long int near2;
-	string file;
-
-	///////////////////////////////////////////////////////////////
-
-	inconf = 10;
-	nbin=2000;
-	file = "../input/alk.traj.dcd";
-	device = 0;
-	nthreads = 128;
-	HANDLE_ERROR (cudaSetDevice(device));//pick the device to use
-	///////////////////////////////////////
-	std::ifstream infile;
-	infile.open(file.c_str());
-	if(!infile){
-		cout<<"file "<<file.c_str()<<" not found\n";
-		return 1;
-	}
-	assert(infile);
-
-
-	ofstream pairfile,stwo;
-	pairfile.open("RDF.dat");
-	stwo.open("Pair_entropy.dat");
-
-	/////////////////////////////////////////////////////////
-	dcdreadhead(&numatm,&nconf,infile);
-	cout<<"Dcd file has "<< numatm << " atoms and " << nconf << " frames"<<endl;
-	if (inconf>nconf) cout << "nconf is reset to "<< nconf <<endl;
-	else
-	{nconf=inconf;}
-	cout<<"Calculating RDF for " << nconf << " frames"<<endl;
-	////////////////////////////////////////////////////////
-
-	unsigned long long int sizef= nconf*numatm*sizeof(double);
-	unsigned long long int sizebin= nbin*sizeof(unsigned long long int);
-
-	// Allocate Unified Memory -- accessible from CPU or GPU
-	cudaMallocManaged(&d_x, sizef);
-	cudaMallocManaged(&d_y, sizef);
-	cudaMallocManaged(&d_z, sizef);
-	cudaMallocManaged(&d_g2, sizebin);
-
-	HANDLE_ERROR (cudaPeekAtLastError());
-
-	memset(d_g2,0,sizebin);
-
-	/////////reading cordinates//////////////////////////////////////////////
-	nvtxRangePush("Read_File");
-	double ax[numatm],ay[numatm],az[numatm];
-	for (int i=0;i<nconf;i++) {
-		dcdreadframe(ax,ay,az,infile,numatm,xbox,ybox,zbox);
-		for (int j=0;j<numatm;j++){
-			d_x[i*numatm+j]=ax[j];
-			d_y[i*numatm+j]=ay[j];
-			d_z[i*numatm+j]=az[j];
-		}
-	}
-	nvtxRangePop(); //pop for Reading file
-
-	nvtxRangePush("Pair_Calculation");
-
-	cout<<"Reading of input file and transfer to gpu is completed"<<endl;
-	//////////////////////////////////////////////////////////////////////////
-
-	near2=nthreads*(int(0.5*numatm*(numatm-1)/nthreads)+1);
-	unsigned long long int nblock = (near2/nthreads);
-
-	cout<<"Initial blocks are "<<nblock<<" "<<", now changing to ";
-
-	int maxblock=65535;
-	int bl;
-	int blockloop= int(nblock/maxblock);
-	if (blockloop != 0) {
-		nblock=maxblock;
-	}
-	cout<<nblock<<" and will run over "<<(blockloop+1)<<" blockloops"<<endl;
-
-	for (bl=0;bl<(blockloop+1);bl++) {
-		//cout <<bl<<endl;
-		pair_gpu<<< nblock,nthreads >>>
-			(d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, nbin, bl);
-
-		HANDLE_ERROR (cudaPeekAtLastError());
-		HANDLE_ERROR(cudaDeviceSynchronize());
-	}
-
-	nvtxRangePop(); //Pop for Pair Calculation
-
-	double pi=acos(-1.0l);
-	double rho=(numatm)/(xbox*ybox*zbox);
-	double norm=(4.0l*pi*rho)/3.0l;
-	double rl,ru,nideal;
-	double g2[nbin];
-	double r,gr,lngr,lngrbond,s2=0.0l,s2bond=0.0l;
-	double box=min(xbox,ybox);
-	box=min(box,zbox);
-	double del=box/(2.0l*nbin);
-	nvtxRangePush("Entropy_Calculation");
-	for (int i=0;i<nbin;i++) {
-		//      cout<<i+1<<" "<<d_g2[i]<<endl;
-		rl=(i)*del;
-		ru=rl+del;
-		nideal=norm*(ru*ru*ru-rl*rl*rl);
-		g2[i]=(double)d_g2[i]/((double)nconf*(double)numatm*nideal);
-		r=(i)*del;
-		pairfile<<(i+0.5l)*del<<" "<<g2[i]<<endl;
-		if (r<2.0l) {
-			gr=0.0l;
-		}
-		else {
-			gr=g2[i];
-		}
-		if (gr<1e-5) {
-			lngr=0.0l;
-		}
-		else {
-			lngr=log(gr);
-		}
-
-		if (g2[i]<1e-6) {
-			lngrbond=0.0l;
-		}
-		else {
-			lngrbond=log(g2[i]);
-		}
-		s2=s2-2.0l*pi*rho*((gr*lngr)-gr+1.0l)*del*r*r;
-		s2bond=s2bond-2.0l*pi*rho*((g2[i]*lngrbond)-g2[i]+1.0l)*del*r*r;
-
-	}
-	nvtxRangePop(); //Pop for Entropy Calculation
-	stwo<<"s2 value is "<<s2<<endl;
-	stwo<<"s2bond value is "<<s2bond<<endl;
-
-	cout<<"#Freeing memory"<<endl;
-	  // Free memory
-	HANDLE_ERROR(cudaFree(d_x));
-	HANDLE_ERROR(cudaFree(d_y));
-	HANDLE_ERROR(cudaFree(d_z));
-	HANDLE_ERROR(cudaFree(d_g2));
-
-	cout<<"#Number of atoms processed: "<<numatm<<endl<<endl;
-	cout<<"#Number of confs processed: "<<nconf<<endl<<endl;
-	cout<<"#number of threads used: "<<nthreads<<endl<<endl;
-	return 0;
+    double xbox,ybox,zbox;
+    double* d_x,*d_y,*d_z;
+    unsigned long long int *d_g2;
+    int nbin;
+    int device;
+    int numatm,nconf,inconf;
+    string file;
+
+    ///////////////////////////////////////////////////////////////
+
+    inconf = 10;
+    nbin = 2000;
+    file = "../input/alk.traj.dcd";
+    device = 0;
+    HANDLE_ERROR (cudaSetDevice(device));//pick the device to use
+    ///////////////////////////////////////
+    std::ifstream infile;
+    infile.open(file.c_str());
+    if(!infile){
+        cout<<"file "<<file.c_str()<<" not found\n";
+        return 1;
+    }
+    assert(infile);
+
+
+    ofstream pairfile,stwo;
+    pairfile.open("RDF.dat");
+    stwo.open("Pair_entropy.dat");
+
+    /////////////////////////////////////////////////////////
+    dcdreadhead(&numatm,&nconf,infile);
+    cout<<"Dcd file has "<< numatm << " atoms and " << nconf << " frames"<<endl;
+    if (inconf>nconf) cout << "nconf is reset to "<< nconf <<endl;
+    else {nconf = inconf;}
+    cout<<"Calculating RDF for " << nconf << " frames"<<endl;
+    ////////////////////////////////////////////////////////
+
+    unsigned long long int sizef= nconf*numatm*sizeof(double);
+    unsigned long long int sizebin= nbin*sizeof(unsigned long long int);
+
+    // Allocate Unified Memory -- accessible from CPU or GPU
+    cudaMallocManaged(&d_x, sizef);
+    cudaMallocManaged(&d_y, sizef);
+    cudaMallocManaged(&d_z, sizef);
+    cudaMallocManaged(&d_g2, sizebin);
+
+    HANDLE_ERROR (cudaPeekAtLastError());
+
+    memset(d_g2,0,sizebin);
+
+    /////////reading cordinates//////////////////////////////////////////////
+    nvtxRangePush("Read_File");
+    double ax[numatm],ay[numatm],az[numatm];
+    for (int i=0;i<nconf;i++) {
+        dcdreadframe(ax,ay,az,infile,numatm,xbox,ybox,zbox);
+        for (int j=0;j<numatm;j++){
+            d_x[i*numatm+j]=ax[j];
+            d_y[i*numatm+j]=ay[j];
+            d_z[i*numatm+j]=az[j];
+        }
+    }
+    nvtxRangePop(); //pop for Reading file
+
+    nvtxRangePush("Pair_Calculation");
+
+    cout<<"Reading of input file and transfer to gpu is completed"<<endl;
+    //////////////////////////////////////////////////////////////////////////
+    dim3 nthreads(128, 1, 1);
+    dim3 nblock;
+    nblock.x = (numatm + nthreads.x - 1)/nthreads.x;
+    nblock.y = (numatm + nthreads.y - 1)/nthreads.y;
+    nblock.z = 1;
+    pair_gpu<<<nblock, nthreads>>>
+        (d_x, d_y, d_z, d_g2, numatm, nconf, xbox, ybox, zbox, nbin);
+
+    HANDLE_ERROR (cudaPeekAtLastError());
+    HANDLE_ERROR(cudaDeviceSynchronize());
+
+
+    nvtxRangePop(); //Pop for Pair Calculation
+
+    double pi=acos(-1.0l);
+    double rho=(numatm)/(xbox*ybox*zbox);
+    double norm=(4.0l*pi*rho)/3.0l;
+    double rl,ru,nideal;
+    double g2[nbin];
+    double r,gr,lngr,lngrbond,s2=0.0l,s2bond=0.0l;
+    double box=min(xbox,ybox);
+    box=min(box,zbox);
+    double del=box/(2.0l*nbin);
+    nvtxRangePush("Entropy_Calculation");
+    for (int i=0;i<nbin;i++) {
+        rl=(i)*del;
+        ru=rl+del;
+        nideal=norm*(ru*ru*ru-rl*rl*rl);
+        g2[i]=(double)d_g2[i]/((double)nconf*(double)numatm*nideal);
+        r=(i)*del;
+        pairfile<<(i+0.5l)*del<<" "<<g2[i]<<endl;
+        if (r<2.0l) {
+            gr=0.0l;
+        }
+        else {
+            gr=g2[i];
+        }
+        if (gr<1e-5) {
+            lngr=0.0l;
+        }
+        else {
+            lngr=log(gr);
+        }
+
+        if (g2[i]<1e-6) {
+            lngrbond=0.0l;
+        }
+        else {
+            lngrbond=log(g2[i]);
+        }
+        s2=s2-2.0l*pi*rho*((gr*lngr)-gr+1.0l)*del*r*r;
+        s2bond=s2bond-2.0l*pi*rho*((g2[i]*lngrbond)-g2[i]+1.0l)*del*r*r;
+
+    }
+    nvtxRangePop(); //Pop for Entropy Calculation
+    stwo<<"s2 value is "<<s2<<endl;
+    stwo<<"s2bond value is "<<s2bond<<endl;
+
+    cout<<"#Freeing memory"<<endl;
+    // Free memory
+    HANDLE_ERROR(cudaFree(d_x));
+    HANDLE_ERROR(cudaFree(d_y));
+    HANDLE_ERROR(cudaFree(d_z));
+    HANDLE_ERROR(cudaFree(d_g2));
+
+    cout<<"#Number of atoms processed: "<<numatm<<endl<<endl;
+    cout<<"#Number of confs processed: "<<nconf<<endl<<endl;
+    return 0;
 }
 
-__global__ void pair_gpu(
-		const double* d_x, const double* d_y, const double* d_z, 
-		unsigned long long int *d_g2, int numatm, int nconf, 
-		const double xbox,const double ybox,const double zbox,int d_bin,  unsigned long long int bl)
+__global__ void pair_gpu(const double* d_x, const double* d_y, const double* d_z,
+    unsigned long long int *d_g2, int numatm, int nconf, 
+    double xbox, double ybox, double zbox, int d_bin)
 {
-	double r,cut,dx,dy,dz;
-	int ig2,id1,id2;
-	double box;
-	box=min(xbox,ybox);
-	box=min(box,zbox);
-
-	double del=box/(2.0*d_bin);
-	cut=box*0.5;
-	int thisi;
-	double n;
-
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	int maxi = min(int(0.5*numatm*(numatm-1)-(bl*65535*128)),(65535*128));
-
-	if ( i < maxi ) {
-		thisi=bl*65535*128+i;
-
-		n=(0.5)*(1+ ((double) sqrt (1.0+4.0*2.0*thisi)));
-		id1=int(n);
-		id2=thisi-(0.5*id1*(id1-1));
-
-		for (int frame=0;frame<nconf;frame++){
-			dx=d_x[frame*numatm+id1]-d_x[frame*numatm+id2];
-			dy=d_y[frame*numatm+id1]-d_y[frame*numatm+id2];
-			dz=d_z[frame*numatm+id1]-d_z[frame*numatm+id2];
-
-			dx=dx-xbox*(round(dx/xbox));
-			dy=dy-ybox*(round(dy/ybox));
-			dz=dz-zbox*(round(dz/zbox));
-
-			r=sqrtf(dx*dx+dy*dy+dz*dz);
-			if (r<cut) {
-				ig2=(int)(r/del);
-				atomicAdd(&d_g2[ig2],2) ;
-			}
-		}
-	}
+    double r, cut, dx, dy, dz;
+    int ig2;
+    double box;
+    box = min(xbox, ybox);
+    box = min(box, zbox);
+
+    double del = box / (2.0 * d_bin);
+    cut = box * 0.5;
+
+    int id1 = blockIdx.y * blockDim.y + threadIdx.y;
+    int id2 = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (id1 >= numatm || id2 >= numatm) return;
+    if (id1 > id2) return;
+
+    for (int frame = 0; frame < nconf; ++frame) {
+        dx = d_x[frame * numatm + id1] - d_x[frame * numatm + id2];
+        dy = d_y[frame * numatm + id1] - d_y[frame * numatm + id2];
+        dz = d_z[frame * numatm + id1] - d_z[frame * numatm + id2];
+
+        dx = dx - xbox * (round(dx / xbox));
+        dy = dy - ybox * (round(dy / ybox));
+        dz = dz - zbox * (round(dz / zbox));
+
+        r = sqrtf(dx * dx + dy * dy + dz * dz);
+        if (r < cut) {
+            ig2 = (int)(r / del);
+            atomicAdd(&d_g2[ig2], 2);
+        }
+    }
 }
-
-