-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsearch-cron
executable file
·160 lines (133 loc) · 4.48 KB
/
search-cron
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/bin/sh
#
# search-cron - back up, crawl and reindex the search cores.
#
# Usage: search-cron LOGS_DIR BACKUPS_DIR
#   LOGS_DIR     directory receiving the per-step log files
#   BACKUPS_DIR  directory whose free space is checked before a backup
#
# CORES_HOME is used throughout but is not assigned in this file; presumably
# it is provided by common.sh or the environment -- TODO confirm.

# Absolute directory holding this script. CDPATH is cleared so `cd` cannot
# print a path of its own, and `&&` keeps `pwd` from reporting the caller's
# directory when the `cd` fails.
SOURCE_DIR=$(CDPATH= cd -- "$(dirname "$0")" && pwd)
. "$SOURCE_DIR/common.sh"

# Minimum amount that must stay free after a backup. It is compared against
# `df -k` figures, i.e. 1 KiB blocks, so 500000 is roughly 500 MB.
MUST_REMAIN_SPACE=500000

# Helper scripts live next to this one.
SCRIPTS_DIR=$SOURCE_DIR
CRAWL_SCRIPT=$SCRIPTS_DIR/crawl
REINDEX_SCRIPT=$SCRIPTS_DIR/reindex
FIXSEGMENTS_SCRIPT=$SCRIPTS_DIR/fixSegments
RSNAPSHOT_SCRIPT=/usr/bin/rsnapshot

# Directory the script was started from (only referenced by the disabled
# "remove old segments" step in do_core).
CURWD=$(pwd)

# Age thresholds handed to `find -mtime +N` when pruning old files:
# MAX_FETCH_TIME applies to LOGS_DIR, MAX_BACKUP_TIME to BACKUPS_DIR.
MAX_FETCH_TIME=2
MAX_BACKUP_TIME=1

# Positional arguments.
LOGS_DIR=$1
BACKUPS_DIR=$2
#######################################
# Back up, crawl and reindex a single core.
#
# Steps, in order:
#   1. create the core's crawldb directory, or run the fixSegments script
#      when it already exists;
#   2. if the crawldb exists, check free space, snapshot it with rsnapshot
#      and then delete it;
#   3. crawl the site;
#   4. prune old files from LOGS_DIR and BACKUPS_DIR;
#   5. reindex.
#
# Globals read:
#   CORES_HOME (not assigned in this file -- presumably from common.sh),
#   LOGS_DIR, BACKUPS_DIR, DATE, MUST_REMAIN_SPACE, MAX_FETCH_TIME,
#   MAX_BACKUP_TIME, FIXSEGMENTS_SCRIPT, RSNAPSHOT_SCRIPT, CRAWL_SCRIPT,
#   REINDEX_SCRIPT
# Globals written:
#   CORE_NAME, CRAWLDB_DIR, CRAWLDB_SIZE, SPACE_LEFT, REMAINING_SIZE,
#   RETCODE and the six *_OUT_LOG / *_ERR_LOG paths
# Arguments:
#   $1 - core name (e.g. rq-en)
# Outputs:
#   One diagnostic line on stdout when a step fails; the output of each
#   step goes to its own log file under LOGS_DIR.
# Returns:
#   0 on success; 1 when there is not enough room for the backup, the free
#   space cannot be determined, or the backup, crawl or reindex step fails.
#######################################
do_core() {
  CORE_NAME=$1
  CRAWLDB_DIR="${CORES_HOME}/${CORE_NAME}/crawldb"

  if [ ! -e "$CRAWLDB_DIR" ]; then
    mkdir "$CRAWLDB_DIR"
  else
    "$FIXSEGMENTS_SCRIPT" "$CORE_NAME"
  fi

  BACKUP_OUT_LOG="${LOGS_DIR}/${DATE}-OUT-backup-${CORE_NAME}.log"
  BACKUP_ERR_LOG="${LOGS_DIR}/${DATE}-ERROR-backup-${CORE_NAME}.log"
  CRAWL_OUT_LOG="${LOGS_DIR}/${DATE}-OUT-crawl-${CORE_NAME}.log"
  CRAWL_ERR_LOG="${LOGS_DIR}/${DATE}-ERROR-crawl-${CORE_NAME}.log"
  REINDEX_OUT_LOG="${LOGS_DIR}/${DATE}-OUT-reindex-${CORE_NAME}.log"
  REINDEX_ERR_LOG="${LOGS_DIR}/${DATE}-ERROR-reindex-${CORE_NAME}.log"

  # Create a backup of the crawl DB, just in case
  if [ -e "$CRAWLDB_DIR" ]; then
    # -k on both tools so the two figures share one unit (1 KiB blocks);
    # -P keeps df's record on a single line so field 4 is always the
    # available space.
    CRAWLDB_SIZE=$(du -sk "$CRAWLDB_DIR" | awk '{print $1}')
    SPACE_LEFT=$(df -Pk "$BACKUPS_DIR" | tail -1 | awk '{print $4}')

    # Refuse to continue on an empty or non-numeric figure instead of
    # letting the arithmetic below abort with a syntax error.
    case "${CRAWLDB_SIZE}:${SPACE_LEFT}" in
      *[!0-9:]* | :* | *:)
        echo "${CORE_NAME}: Unable to determine the space available for the backup"
        return 1
        ;;
    esac

    REMAINING_SIZE=$((SPACE_LEFT - CRAWLDB_SIZE))
    if [ "$REMAINING_SIZE" -le "$MUST_REMAIN_SPACE" ]; then
      echo "${CORE_NAME}: Not enough space to create the backup"
      return 1
    fi

    "$RSNAPSHOT_SCRIPT" -c "${CORES_HOME}/${CORE_NAME}/rsnapshot.conf" daily \
      > "$BACKUP_OUT_LOG" 2> "$BACKUP_ERR_LOG"
    RETCODE=$?

    # Check if the backup failed
    if [ "$RETCODE" -ne 0 ]; then
      echo "${CORE_NAME}: Backup failed (${RETCODE}), see backup log at ${BACKUP_ERR_LOG}"
      rm -rf "${BACKUPS_DIR}/crawl-${CORE_NAME}-${DATE}"
      return 1
    fi

    # The crawl DB is only discarded once its snapshot succeeded.
    rm -rf "$CRAWLDB_DIR"
  fi

  # Disabled step, kept for reference:
  # # Remove old segments
  # if [ -e $CRAWLDB_DIR/segments ]; then
  #   cd $CRAWLDB_DIR/segments && find . -maxdepth 1 -type d -not -path . -mtime +${MAX_FETCH_TIME} -exec rm -f '{}' \;
  #   $FIXSEGMENTS_SCRIPT ${CORE_NAME}
  #   cd $CURWD
  # fi

  # Crawl the website
  "$CRAWL_SCRIPT" "$CORE_NAME" > "$CRAWL_OUT_LOG" 2> "$CRAWL_ERR_LOG"
  RETCODE=$?

  # Check if an error occurred while crawling the website
  if [ "$RETCODE" -ne 0 ]; then
    echo "${CORE_NAME}: Crawl failed (${RETCODE}), see error log at ${CRAWL_ERR_LOG}"
    return 1
  fi

  echo "------------ TERMINE ------------" >> "$CRAWL_OUT_LOG"

  # Remove old log files. `&>` is a bash extension; under /bin/sh it has to
  # be spelled `> file 2>&1`.
  find "$LOGS_DIR" -type f -mtime +"$MAX_FETCH_TIME" -exec rm -f {} + \
    > /dev/null 2>&1

  # Remove old backups
  find "$BACKUPS_DIR" -type f -mtime +"$MAX_BACKUP_TIME" -exec rm -f {} + \
    > /dev/null 2>&1

  # Reindex all pages
  "$REINDEX_SCRIPT" "$CORE_NAME" > "$REINDEX_OUT_LOG" 2> "$REINDEX_ERR_LOG"
  RETCODE=$?

  # Check if reindexation failed
  if [ "$RETCODE" -ne 0 ]; then
    echo "${CORE_NAME}: Reindexation failed (${RETCODE})"
    return 1
  fi

  return 0
}
# Abort the script when the command that ran immediately before this call
# failed, re-using that command's exit status. Must be invoked directly
# after the command to check, since it reads $? on entry.
# Globals written: RETCODE
check_status() {
  RETCODE=$?
  [ "$RETCODE" -eq 0 ] || exit "$RETCODE"
}
#######################################
# Remove Hadoop scratch data and Hadoop log files for one core.
# Globals read:    CORES_HOME, USER
# Globals written: HADOOP_TMP_DIR, NUTCH_HOME
# Arguments:       $1 - core name (e.g. rq-en)
#######################################
cleanup() {
  # The Hadoop tmp directory can take all available space.
  # USER may be unset in minimal environments, so fall back to `id -un`
  # rather than targeting "/tmp/hadoop-".
  HADOOP_TMP_DIR="/tmp/hadoop-${USER:-$(id -un)}"
  # rm -rf is already silent when the directory is missing.
  rm -rf -- "$HADOOP_TMP_DIR"

  NUTCH_HOME="${CORES_HOME}/${1}/apache-nutch/"
  # Only the directory part is quoted so the hadoop.* glob still expands.
  rm -f -- "${NUTCH_HOME}"/logs/hadoop.*
}
# Entry point: validate the arguments, take the PID lock, then clean up and
# index each core in turn. Exits 1 on a usage error, when another instance
# is running, or when either core fails.
if [ -z "$LOGS_DIR" ] || [ -z "$BACKUPS_DIR" ]; then
  echo "You must provide the path to the logs and backups directories (ex: ./search-cron ./logs ./backups)" >&2
  exit 1
fi

PID_FILE="${CORES_HOME}/search.pid"
if [ -e "$PID_FILE" ]; then
  SEARCH_PID=$(cat "$PID_FILE")
  # `kill -0` sends no signal and prints nothing; only its exit status says
  # whether the process exists, so it must be tested directly rather than
  # captured as text.
  if [ -n "$SEARCH_PID" ] && kill -0 "$SEARCH_PID" 2>/dev/null; then
    echo "Script already running, exiting"
    exit 1
  fi
fi

# Reaching this point means there is no pid file or it is stale: record our
# own PID and make sure the file goes away on every exit path, including
# the early exits performed by check_status.
echo $$ > "$PID_FILE"
trap 'rm -f "$PID_FILE"' EXIT

if [ ! -e "$LOGS_DIR" ]; then
  mkdir -p "$LOGS_DIR"; check_status
fi

if [ ! -e "$BACKUPS_DIR" ]; then
  mkdir -p "$BACKUPS_DIR"; check_status
fi

# Timestamp used in the log file names written by do_core.
DATE=$(date +%Y-%m-%d-%H%M)

echo "Cleaning up english index"
cleanup rq-en
echo "Indexing english site"
do_core rq-en
RETCODE_EN=$?
cleanup rq-en

echo "Cleaning up french index"
cleanup rq-fr
echo "Indexing french site"
do_core rq-fr
RETCODE_FR=$?
cleanup rq-fr

# Delete empty log files
find "$LOGS_DIR" -type f -size 0 -exec rm -f {} +

# Both cores are always attempted; report failure if either one failed.
if [ "$RETCODE_FR" -ne 0 ] || [ "$RETCODE_EN" -ne 0 ]; then
  exit 1
fi