Print

Print


I don't know if this helps:

[root@arxiloxos6 root]# ps -ax
  PID TTY      STAT   TIME COMMAND
    1 ?        S      0:06 init
    2 ?        SW     0:00 [migration/0]
    3 ?        SW     0:00 [migration/1]
    4 ?        SW     0:00 [keventd]
    5 ?        SWN    0:00 [ksoftirqd/0]
    6 ?        SWN    0:00 [ksoftirqd/1]
    9 ?        SW     0:00 [bdflush]
    7 ?        SW     0:02 [kswapd]
    8 ?        SW     0:09 [kscand]
   10 ?        SW     0:03 [kupdated]
   11 ?        SW     0:00 [mdrecoveryd]
   15 ?        SW     0:14 [kjournald]
   70 ?        SW     0:00 [khubd]
 1504 ?        SW     0:00 [kjournald]
 1878 ?        S      0:05 syslogd -m 0
 1882 ?        S      0:00 klogd -x
 1892 ?        S      0:24 irqbalance
 1909 ?        S      0:00 portmap
 1928 ?        S      0:00 rpc.statd
 1939 ?        S      0:00 mdadm --monitor --scan -f
 1986 ?        SW     0:00 [rpciod]
 1987 ?        SW     0:00 [lockd]
 2030 ?        S      0:00 /usr/sbin/smartd
 2039 ?        S      0:00 /usr/sbin/sshd
 2053 ?        S      0:00 xinetd -stayalive -pidfile /var/run/xinetd.pid
 2076 ?        SL     0:09 ntpd -U ntp -p /var/run/ntpd.pid
 2093 ?        S      0:22 zhm arxiloxos6.inp.demokritos.gr
 2113 ?        S      0:09 sendmail: accepting connections
 2122 ?        S      0:00 sendmail: Queue runner@01:00:00 for
/var/spool/clientmqueue
 2132 ?        S      0:00 gpm -t imps2 -m /dev/mouse
 2141 ?        S      0:00 crond
 2150 ?        S      9:18 /usr/sbin/pbs_mom -p
 2173 ?        S      0:00 xfs -droppriv -daemon
 2190 ?        S      0:00 /usr/sbin/atd
 2209 tty1     S      0:00 /sbin/mingetty tty1
 2210 tty2     S      0:00 /sbin/mingetty tty2
 2211 tty3     S      0:00 /sbin/mingetty tty3
 2212 tty4     S      0:00 /sbin/mingetty tty4
 2213 tty5     S      0:00 /sbin/mingetty tty5
 2214 tty6     S      0:00 /sbin/mingetty tty6
 2215 ?        S      0:00 /usr/bin/gdm-binary -nodaemon
 2373 ?        S      0:00 /usr/bin/gdm-binary -nodaemon
 2374 ?        S     19:11 /usr/X11R6/bin/X :0 -auth /var/gdm/:0.Xauth vt7
 2389 ?        S      3:54 /usr/bin/gdmgreeter
 6108 ?        S      0:00 -sh
 6310 ?        S      0:00 /bin/sh
/var/spool/pbs/mom_priv/jobs/3572.xg009..SC
 6314 ?        S      0:00 /usr/bin/perl -w /tmp/bootstrap.AC6311
/home/dteam002/ xg009.inp.demokritos.gr
/home/dteam002/.globus/.gass_cache/local/md5/15/de2
 6320 ?        S      0:00 /usr/bin/perl -w /tmp/bootstrap.AC6311
/home/dteam002/ xg009.inp.demokritos.gr
/home/dteam002/.globus/.gass_cache/local/md5/15/de2
 6573 ?        S      0:00 /usr/bin/perl -w /tmp/bootstrap.AC6311
/home/dteam002/ xg009.inp.demokritos.gr
/home/dteam002/.globus/.gass_cache/local/md5/15/de2
 6784 ?        S      0:00 bash
/home/dteam002/globus-tmp.arxiloxos6.6314.0/globus-tmp.arxiloxos6.6314.0/local/md5/15/de2f91fa165b6e72c05ba3b3fa20c0/md5/60/9
 6852 ?        S      0:00 bash
/home/dteam002/globus-tmp.arxiloxos6.6314.0/globus-tmp.arxiloxos6.6314.0/local/md5/15/de2f91fa165b6e72c05ba3b3fa20c0/md5/60/9
 6853 ?        S      0:00 /bin/bash ./testJob.sh
 6854 ?        SN     0:01 python2 /opt/lcg/bin/lcg-mon-wn -j
https://gdrb02.cern.ch:9000/-2gUNHn_skh6LaVH0T6r4Q -p
/tmp/globus-tmp.arxiloxos6.6314.0 -l 3572
 6855 ?        S      0:00 perl -e ?    while (1) {?      $time_left =
`grid-proxy-info -timeleft 2> /dev/null` || 0;?      last if ($time_left
<= 0);?
 6873 ?        S      0:00 perl ./run-test sft-lcg-rm
 6898 ?        S      0:00 /bin/bash tests/sft-lcg-rm
 8269 ?        S      0:00 perl ./run-test sft-lcg-rm-cr
 8279 ?        S      0:00 /bin/bash tests/sft-lcg-rm-cr
 8290 ?        S      0:00 lcg-cr -v --vo dteam -d xg006.inp.demokritos.gr
-l lfn:sft-lcg-rm-cr-arxiloxos6.inp.demokritos.gr.0510112121
file:///home/dteam002
22905 ?        S      0:00 sshd: root@pts/0
22907 pts/0    S      0:00 -bash
23060 pts/0    R      0:00 ps -ax


> Filippidis christos wrote:
>
>> hi again,
>>
>> I have an SFT job with a "problem" right now (actually it has been
>> "running" since yesterday).
>>
>> The info I can get for this job is:
>> (I don't know how to get more)
>
> Do not trust the output of "qstat" or "pbsnodes": PBS/Torque has bugs and
> occasionally it will get into a bad state.  Log in on the WN and look
> around with "ps" etc.
>
>> arxiloxos6.inp.demokritos.gr
>>      state = free
>>      np = 2
>>      properties = lcgpro
>>      ntype = cluster
>>      jobs = 0/3572.xg009.inp.demokritos.gr
>>      status = arch=linux,uname=Linux arxiloxos6.inp.demokritos.gr
>> 2.4.21-32.0.1.EL.cernsmp #1 SMP Thu May 26 12:29:50 CEST 2005
>> i686,sessions=2389
>> 6108,nsessions=2,nusers=2,idletime=5778321,totmem=1554432kb,availmem=1111516kb,physmem=510216kb,ncpus=2,loadave=0.00,rectime=1129124056
>>
>> [root@xg009 root]# qstat
>> Job id           Name             User             Time Use S Queue
>> ---------------- ---------------- ---------------- -------- - -----
>> 3572.xg009         STDIN            dteam002         00:00:22 R dteam
>>
>> You can see it at
>> https://lcg-sft.cern.ch:9443/sft/sitehistory.cgi?site=xg009.inp.demokritos.gr
>> This causes many problems, because today I don't have any new SFT jobs,
>> probably because it seems that there is a dteam job that is still running.
>>
>> If I delete this job then I will have new SFT jobs until 18:00, and then
>> the same thing will happen again.
>>
>>
>> thanks
>> xristos
>>
>>
>>
>>>Hi Guys,
>>>
>>>I was trying to figure out why the test job could hang, but I must
>>>admit that I was unable to reproduce the problem. Normally all tests
>>>are killed automatically after 15 minutes by the SIGALRM signal
>>>handler (the signal handler sends KILL signal to test process), and
>>>when I try to simulate hanging tests everything works fine for me.
>>>
>>>Could you please check the list of running processes on the WN when
>>>it happens next time? And if it's possible if you could also note
>>>down the time when the job actually started to execute and when you
>>>checked the process table...
>>>This is the most obvious way we can investigate what is happening.
>>>
>>>Piotr
>>>
>>>On Oct 12, 2005, at 1:00 PM, Gerhard Walzel wrote:
>>>
>>>
>>>>Judit
>>>>I have exactly the same problem on site Hephy-Vienna
>>>>Just starting at 0015 !
>>>>The last few days I have simply removed the job to enable
>>>>SFT tests again...
>>>>Gerhard
>>>>
>>>>
>>>>On 10/12/05 11:59 AM, "NOVAK Judit" <[log in to unmask]> wrote:
>>>>
>>>>
>>>>
>>>>>Hi Christos,
>>>>>
>>>>>
>>>>>In the site history I can see two Job Submission failures,
>>>>>both from last week. The last one ran into a timeout (while gstat
>>>>>reports many free CPUs -- is it all OK with the batch system?).
>>>>>
>>>>>
>>>>>Judit
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>On k, okt 11, Filippidis christos wrote:
>>>>>
>>>>>
>>>>>>hi to all,
>>>>>>
>>>>>>i have the following problem:
>>>>>>
>>>>>>Our site here at Demokritos is passing the SFT, but for the last week,
>>>>>>every day when dteam002 "/c=ch/o=cern/ou=grid/cn=judit novak 0973"
>>>>>>sends an SFT at 18:00, the job never ends or it stops the next day,
>>>>>>and the result is CT or js.
>>>>>>
>>>>>>At the same time, when I send an SFT from this site:
>>>>>>https://monitoring.egee.man.poznan.pl/
>>>>>>everything is OK.
>>>>>>
>>>>>>
>>>>>>It is also strange that when Judit Novak sends an SFT at another
>>>>>>time of the day, for example in the morning, the SFT is successful.
>>>>>>
>>>>>>do you have any ideas?
>>>>>>
>>>>>>thanks xristos
>>>>>>
>>>>>>
>>>>>>Christos Filippidis
>>>>>>NCSR DEMOKRITOS
>>>>>>Institute of Nuclear Physics
>>>>>>office block 6(ktirion 6)
>>>>>>Gr-15310 Agia Paraskevi
>>>>>>GREECE
>>>>>>Tel:2106503425
>>>>>>
>>>>>>http://consult.cern.ch/xwho/people/117002
>>>>>>http://www.inp.demokritos.gr/~filippidisx/
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>----------------------------------------------
>>>>>>
>>>>>>"Institute of Nuclear Physics NCSR Demokritos"
>>>>>> http://www.inp.demokritos.gr/
>>>>>>
>>>>>>
>>>>>>
>>>>>>Christos Filippidis
>>>>>>NCSR DEMOKRITOS
>>>>>>Institute of Nuclear Physics
>>>>>>office block 6(ktirion 6)
>>>>>>Gr-15310 Agia Paraskevi
>>>>>>GREECE
>>>>>>Tel:2106503425
>>>>>>
>>>>>>http://consult.cern.ch/xwho/people/117002
>>>>>>http://www.inp.demokritos.gr/~filippidisx/
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>----------------------------------------------
>>>>>>
>>>>>>"Institute of Nuclear Physics NCSR Demokritos"
>>>>>> http://www.inp.demokritos.gr/
>>>>
>>
>>
>> Christos Filippidis
>> NCSR DEMOKRITOS
>> Institute of Nuclear Physics
>> office block 6(ktirion 6)
>> Gr-15310 Agia Paraskevi
>> GREECE
>> Tel:2106503425
>>
>> http://consult.cern.ch/xwho/people/117002
>> http://www.inp.demokritos.gr/~filippidisx/
>>
>>
>>
>>
>>
>> ----------------------------------------------
>>
>> "Institute of Nuclear Physics NCSR Demokritos"
>>  http://www.inp.demokritos.gr/
>
>


Christos Filippidis
NCSR DEMOKRITOS
Institute of Nuclear Physics
office block 6(ktirion 6)
Gr-15310 Agia Paraskevi
GREECE
Tel:2106503425

http://consult.cern.ch/xwho/people/117002
http://www.inp.demokritos.gr/~filippidisx/





----------------------------------------------

"Institute of Nuclear Physics NCSR Demokritos"
 http://www.inp.demokritos.gr/