"""
Compute pairwise distances between HIS ND1 atoms in a folder of PDB files.

It works on gzipped PDB files, so the PDB mirror folder can be kept compressed.
All PDB files must be in the same folder (set1); the script does not traverse
subdirectories. It is limited to roughly 30,000 files per folder, so the
~72,000 files mirrored from the PDB were split into three folders (set1, set2
and set3), each producing its own output file. The outputs can then be merged
into a single file with cat.
"""
import glob, os, pymol, sys
from pymol import cmd
from chempy import cpv
the_pdb="/Users/cale/pdb/set1"
files = glob.glob(the_pdb+os.sep+"*.ent.gz")
if not len(files):
print "Please set 'the_pdb' variable to a valid path containing PDB files."
sys.exit(1)
else:
print "Processing %d files." % len(files)
s, outFile = "resn HIS and name ND1", "dist_set1.csv"
f = open(outFile, 'wb')
# write the header
f.write("PDB\tCHAIN\tRESI\tATOM-A\tCHAIN\tRESI\tATOM-B\tDISTANCE\n")
# for each file in the mirror
for x in files:
cmd.load(x,finish=1)
n = cmd.get_names()[0]
m = cmd.get_model(s).atom
# pairwise for each atom
for aa in m:
for bb in m:
# avoid distances to self
if aa==bb: continue
# avoid duplicates
if aa>bb: continue
distance = cpv.distance(aa.coord, bb.coord)
# don't list if distance is above 10 angstroms
# if distance > 10 : continue
f.write( "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%f\n" %
(n, aa.chain, aa.resi, aa.index,
bb.chain, bb.resi, bb.index,
distance))
cmd.delete(n)
f.close()
print "Processed %d files. Please see %s for results." % (len(files), outFile)