Hi, Ewan:
At Glasgow we haven't seen such jobs recently, but between Oct 21 and 27
we had 12 such jobs that got killed. Take one job as an example.
In the ARC log we have:
gm-jobs.log-20141026:2014-10-25 15:48:16 Finished - job id:
9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn, unix user:
201847:201010, name: "arc_pilot", owner:
"/C=UK/O=eScience/OU=CLRC/L=RAL/CN=alastair dewhurst", lrms: condor,
queue: condor_q2d, lrmsid: 315877.svr019.gla.scotgrid.ac.uk, failure:
"LRMS error: (271) job killed: vmem."
In the condor history we have the following lines recorded for this job:
NumCkpts_RAW = 0
BufferSize = 524288
NiceUser = false
CoreSize = 0
CumulativeSlotTime = 0
OnExitHold = false
RequestCpus = 8
Err =
"/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn.comment"
BufferBlockSize = 32768
ExecutableSize_RAW = 10
x509userproxy =
"/var/spool/arc/jobstatus/job.9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn.proxy"
ImageSize = 10
CurrentTime = time()
WantCheckpoint = false
CommittedTime = 0
TargetType = "Machine"
WhenToTransferOutput = "ON_EXIT_OR_EVICT"
Cmd =
"/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn/condorjob.sh"
JobUniverse = 5
ExitBySignal = false
TransferIn = false
Iwd =
"/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn"
NumRestarts = 0
CommittedSuspensionTime = 0
Owner = "prdatlas047"
NumSystemHolds = 0
CumulativeSuspensionTime = 0
AccountingGroup = strcat(AcctGroup,".",AcctSubGroup,".",Owner)
NextJobStartDelay = ifThenElse(regexp("nagios",Owner),10,0.1)
Environment = "LFC_CONRETRYINT=10 _=/usr/bin/condor_submit
X509_USER_CERT=fake LD_LIBRARY_PATH=/usr/lib64 LFC_CONNTIMEOUT=30
PWD=/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn
X509_CERT_DIR=/etc/grid-security/certificates X509_USER_KEY=fake SHLVL=2
CONDOR_BIN_PATH=/usr/bin LANG=en_GB.UTF-8
GRIDMAP=/etc/grid-security/grid-mapfile
LFC_HOST=prod-lfc-shared-central.cern.ch TERM=xterm
RUNTIME_CONFIG_DIR=/etc/arc/runtime OLDPWD=/ LFC_CONRETRY=1
CONDOR_CONFIG=/etc/condor/condor_config ARC_LOCATION=/usr
X509_VOMS_DIR=/etc/grid-security/vomsdir
PATH=/sbin:/usr/sbin:/bin:/usr/bin OPENSSL_ALLOW_PROXY_CERTS=1
GLOBUS_LOCATION=/usr
X509_USER_PROXY=/var/spool/arc/jobstatus/job.9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn.proxy"
RequestDisk = DiskUsage
Requirements = ( ( OpSys == "LINUX" ) ) && ( TARGET.Arch == "X86_64" )
&& ( TARGET.Disk >= RequestDisk ) && ( TARGET.Memory >= RequestMemory )
&& ( TARGET.Cpus >= RequestCpus ) && ( TARGET.HasFileTransfer )
MinHosts = 1
JobNotification = 0
NumCkpts = 0
LastSuspensionTime = 0
NumJobStarts = 0
WantRemoteSyscalls = false
JobLeaseDuration = 7200
AcctGroup =
ifThenElse(regexp("scotg001",Owner),"group_HIGHPRIO",ifThenElse(x509UserProxyVOName
=?= "atlas","group_ATLAS",ifThenElse(x509UserProxyVOName =?=
"cms","group_CMS",ifThenElse(x509UserProxyVOName =?=
"dteam","group_DTEAM_OPS",ifThenElse(x509UserProxyVOName =?=
"ops","group_DTEAM_OPS",ifThenElse(regexp("nagios",Owner),"group_OTHER","group_NONLHC"))))))
JobPrio = 0
RootDir = "/"
CurrentHosts = 0
x509UserProxyExpiration = 1414573322
MaxWallTime = ifThenElse(x509UserProxyVOName =?= "atlas",48 * 60 *
60,ifThenElse(x509UserProxyVOName =?= "cms",48 * 60 *
60,ifThenElse(x509UserProxyVOName =?= "dteam",24 * 60 *
60,ifThenElse(x509UserProxyVOName =?= "ops",24 * 60 * 60,48 * 60 * 60))))
WantRemoteIO = true
StreamOut = false
OnExitRemove = true
AcctSubGroup = ifThenElse(regexp("prdcms",Owner) && RequestCpus >
1,"cms_prod_multicore",ifThenElse(regexp("pilcms",Owner) && RequestCpus
>
1,"cms_pilot_multicore",ifThenElse(regexp("prdcms",Owner),"cms_prod",ifThenElse(regexp("pilcms",Owner),"cms_pilot",ifThenElse(regexp("cms",Owner),"cms",ifThenElse(regexp("prdatlas",Owner)
&& RequestCpus >
1,"prodatls_multicore",ifThenElse(regexp("pilatlas",Owner) &&
RequestCpus >
1,"atlas_pilot_multicore",ifThenElse(regexp("prdatlas",Owner),"atlas_prod",ifThenElse(regexp("pilatlas",Owner),"atlas_pilot",ifThenElse(regexp("atlas",Owner),"atlas",ifThenElse(regexp("dteam",Owner),"dteam",ifThenElse(regexp("ops",Owner),"ops",ifThenElse(regexp("nagios",Owner),"nagios","none")))))))))))))
In = "/dev/null"
DiskUsage = 22
PeriodicRemove = false || ResidentSetSize > JobMemoryLimit
LocalUserCpu = 0.0
RemoteUserCpu = 0.0
TransferInput =
"/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn/.gahp_complete,/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn/runpilot3-wrapper.sh"
ExecutableSize = 10
LocalSysCpu = 0.0
RemoteSysCpu = 0.0
ClusterId = 315877
CompletionDate = 0
JobDescription = "arc_pilot"
RemoteWallClockTime = 0.0
Rank = 0.0
MachineScaling = "$$([ifThenElse(isUndefined(Scaling), 1.00, Scaling)])"
x509UserProxyFQAN = "/C=UK/O=eScience/OU=CLRC/L=RAL/CN=alastair
dewhurst,/atlas/Role=production/Capability=NULL,/atlas/Role=NULL/Capability=NULL,/atlas/lcg1/Role=NULL/Capability=NULL,/atlas/phys-beauty/Role=NULL/Capability=NULL,/atlas/team/Role=NULL/Capability=NULL,/atlas/uk/Role=NULL/Capability=NULL"
LeaveJobInQueue = false
ImageSize_RAW = 10
x509UserProxyEmail = "[log in to unmask]"
CondorVersion = "$CondorVersion: 8.2.2 Aug 07 2014 BuildID: 265643 $"
MyType = "Job"
StreamErr = false
DiskUsage_RAW = 21
PeriodicHold = false
User = "[log in to unmask]"
ConcurrencyLimits = strcat(AcctGroup,",",AcctSubGroup,",",Owner)
x509UserProxyFirstFQAN = "/atlas/Role=production/Capability=NULL"
Arguments = ""
Out =
"/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn.comment"
UserLog =
"/var/spool/arc/grid/9fcKDmVzE2knbbfC3pqhhxZmABFKDmABFKDm5HKKDmBBFKDmnR9qZn/log"
PeriodicRelease = false
MaxHosts = 1
RequestMemory = 16000
CommittedSlotTime = 0
TotalSuspensions = 0
x509userproxysubject = "/C=UK/O=eScience/OU=CLRC/L=RAL/CN=alastair dewhurst"
x509UserProxyVOName = "atlas"
CondorPlatform = "$CondorPlatform: x86_64_RedHat6 $"
TransferInputSizeMB = 0
ExitStatus = 0
ShouldTransferFiles = "YES"
EnteredCurrentStatus = 1414237659
QDate = 1414237659
CumulativeSlotTime = 45760.0
StatsLifetimeStarter = 5719
LastMatchTime = 1414242737
NumJobMatches = 1
LastJobLeaseRenewal = 1414248457
RecentStatsLifetimeStarter = 1200
NordugridQueue = condor_q2d
GlobalJobId = "svr019.gla.scotgrid.ac.uk#315877.0#1414237659"
LastRemoteHost = "[log in to unmask]"
ImageSize = 17500000
BlockReads = 21352
RemoveReason = "The job attribute PeriodicRemove expression 'false ||
ResidentSetSize > JobMemoryLimit' evaluated to TRUE"
BytesRecvd = 33201.0
RecentBlockReadKbytes = 268020
ResidentSetSize = 17500000
RecentBlockWrites = 90
LastVacateTime = 1414248457
OrigMaxHosts = 1
RecentBlockReads = 3015
MemoryUsage = ( ( ResidentSetSize + 1023 ) / 1024 )
BlockWrites = 851
LastSuspensionTime = 0
NumJobStarts = 1
JobMemoryLimit = 16384000
JobFinishedHookDone = 1414248457
BytesSent = 0.0
JobStartDate = 1414242737
CurrentHosts = 0
BlockWriteKbytes = 3712
BlockReadKbytes = 2059024
DiskUsage = 750
ResidentSetSize_RAW = 16532060
RemoteUserCpu = 42715.0
RemoteSysCpu = 236.0
LastRejMatchTime = 1414242673
RemoteWallClockTime = 5720.0
MachineAttrSlotWeight0 = 8
ImageSize_RAW = 16539044
JobRunCount = 1
LastRejMatchReason = "no match found"
DiskUsage_RAW = 550
ProcId = 0
JobCurrentStartExecutingDate = 1414242738
LastJobStatus = 2
JobCurrentStartDate = 1414242737
MATCH_EXP_MachineScaling = "7.700000000000000E-01"
JobStatus = 3
LastPublicClaimId = "<10.141.0.69:39774>#1411725411#50861#..."
AutoClusterAttrs =
"RequestDisk,JobUniverse,LastCheckpointPlatform,NumCkpts,RequestMemory,_cp_orig_RequestCpus,_cp_orig_RequestDisk,_cp_orig_RequestMemory,RequestCpus,westmere,wn,interlagos,NODE_IS_HEALTHY,viglen,DiskUsage,Requirements,Rank,NiceUser,ConcurrencyLimits"
RecentBlockWriteKbytes = 360
StartdPrincipal = "[log in to unmask]"
AutoClusterId = 11204
NumShadowStarts = 1
MachineAttrCpus0 = 8
EnteredCurrentStatus = 1414248457
Let me know if you need more info.
Cheers, Gang
On 12/11/2014 14:59, Ewan MacMahon wrote:
> Hi all,
>
> I'm investigating some atlas multicore jobs that seem to be getting killed off by our condor for excessive memory usage, but I'm getting a little lost in a twisty maze of slightly different constraints on things.
>
> Could anyone (or everyone) with an atlas supporting multicore condor setup possibly send me (off-list, I imagine) the output of 'condor_history -l' for a couple of random completed (or killed) multicore jobs please, just so I can play spot-the-difference?
>
> Thanks,
>
> Ewan
|