-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathgetSeoSitemap.php
2195 lines (1701 loc) · 66.8 KB
/
getSeoSitemap.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?php
/*
getSeoSitemap v5.0.0 LICENSE | 2023-02-27
getSeoSitemap v5.0.0 is distributed under the following BSD-style license:
Copyright (c) 2017-2023
Giovanni Bertone (RED Racing Parts)
https://www.redracingparts.com
red@redracingparts.com
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. All advertising materials mentioning features or use of this software
must display the following acknowledgement:
This product includes software developed by the RED Racing Parts.
4. Neither the name of the RED Racing Parts nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GIOVANNI BERTONE ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GIOVANNI BERTONE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//###################################################################################################
//# Please support this project by making a donation via PayPal to https://www.paypal.me/johnbe4 or #
//# with BTC bitcoin to the address 19928gKpqdyN6CHUh4Tae1GW9NAMT6SfQH #
//###################################################################################################
//#################################################
//##### WARNING: DO NOT CHANGE ANYTHING BELOW #####
//#################################################
require 'config.php';
class getSeoSitemap {
private $version = 'v5.0.0';
private $userAgent = 'getSeoSitemapBot/ver.';
private $scriptTitle = 'getSeoSitemap ver. by John';
private $url = null; // an absolute URL ( ex. https://www.example.com/test/test1.php )
// size of the downloaded page as reported by cURL 'size_download'
// NOTE(review): cURL reports this value in bytes while the previous comment said Kb - confirm the unit expected by consumers
private $size = 0;
private $titleLength = [15, 100]; // min, max title length
private $descriptionLength = [50, 160]; // min, max description length
private $md5 = null; // md5 of the page body (hexadecimal)
private $changefreq = null; // change frequency of file (values: daily, weekly, monthly, yearly)
private $lastmod = null; // timestamp of last modified date of URL
private $state = null; // state of URL
/*
state values:
old = URL of previous scan
new = new URL to scan
scan = new URL already scanned
skip = new generic skipped URL (out of domain, video, image, iframe, audio and link)
mSkip = new skipped URL because of mailto
rSkip = new skipped URL because of robots.txt rules
niSkip = new no-index URL because of robots meta rules
nfSkip = new no-follow URL because of robots meta rules
noSkip = new no-index / no-follow URL because of robots meta rules
*/
private $insUrl = null; // set to true by pageTest and to false when the tested URL has been registered as skipped
private $mysqli = null; // mysqli connection
private $ch = null; // curl connection
private $row = []; // rows (associative arrays) returned by the last SELECT executed through execQuery
private $pageLinks = []; // it includes all links inside a page
private $pageBody = null; // the page including header
private $httpCode = null; // the http response code (integer from cURL) or the string 'C01' when curl_exec fails
private $contentType = null; // the header content-type
private $rowNum = null; // number of rows returned by the last SELECT / SHOW query
private $count = null; // result of the last "SELECT SQL_NO_CACHE COUNT(*) AS count" query (ex. 125)
private $query = null; // SQL text executed by execQuery
private $stmt = null; // statement for prepared query
private $stmt2 = null; // statement 2 for prepared query
private $stmt3 = null; // statement 3 for prepared query
private $stmt4 = null; // statement 4 for prepared query
private $stmt5 = null; // statement 5 for prepared query (bound to title, description, url in getIndexFollowSeo)
private $stmt6 = null; // statement 6 for prepared query
private $stmt7 = null; // statement 7 for prepared query
private $startTime = null; // start timestamp
private $followExclusion = [ // do not follow links inside these file content types
'application/pdf',
];
private $seoExclusion = [ // file content types out of seo
'application/pdf',
'application/javascript',
'text/javascript'
];
private $indexExclusion = [ // file content types out of sitemap
'application/javascript',
'text/javascript'
];
private $changefreqArr = ['daily', 'weekly', 'monthly', 'yearly']; // changefreq accepted values
private $priorityArr = ['1.0', '0.9', '0.8', '0.7', '0.6', '0.5', '0.4', '0.3', '0.2', '0.1']; // priority accepted values
private $exec = 'n'; // execution value (could be y or n)
private $errCounter = 0; // error counter
private $maxErr = 20; // max number of errors to stop execution
// messages for the internal (non HTTP) codes stored in httpCode; 'C01' is assigned by curlExec when curl_exec fails
private $errMsg = [
'C01' => 'cURL error for multiple choices server response'
];
// Entity escaping required by the sitemap protocol for URLs written into XML.
// The ampersand entry must stay first so that, with a sequential replacement,
// the ampersands introduced by the other entities are not escaped a second time.
private $escapeCodeArr = [ // escape code conversions
    '&' => '&amp;',
    "'" => '&apos;',
    '"' => '&quot;',
    '>' => '&gt;',
    '<' => '&lt;',
];
private $maxUrlsInSitemap = 50000; // max number of URLs into a single sitemap
private $maxTotalUrls = 2500000000; // max total number of URLs (50000 * 50000)
private $totUrls = null; // total URLs at the end
private $sitemapMaxSize = 52428800; // max sitemap size (bytes)
private $sitemapNameArr = []; // includes paths of all saved sitemaps at the end of the process
private $maxUrlLength = 767; // max URL length
private $maxPageSize = 16777215; // max page size | bytes
private $seoMaxPageSize = 153600; // page max file size in bytes. this param is only for SEO
private $malfChars = [' ']; // list of characters to detect malformed URLs following a standard good practice
private $multipleSitemaps = null; // true when multiple sitemaps are available
private $logPath = null; // log path (file appended to by writeLog)
private $skipUrl = []; // URLs to skip
private $allowUrl = []; // URLs to allow
private $robotsPath = null; // robots.txt path
private $robotsLines = []; // robots.txt lines
private $countUrlWithoutDesc = 0; // counter of URLs without description
private $countUrlWithMultiDesc = 0; // counter of URLs with multiple description
private $countUrlWithoutTitle = 0; // counter of URLs without title
private $countUrlWithMultiTitle = 0; // counter of URLs with multiple title
private $countUrlWithoutH1 = 0; // counter of URLs without h1
private $countUrlWithMultiH1 = 0; // counter of URLs with multiple h1
private $countUrlWithoutH2 = 0; // counter of URLs without h2
private $countUrlWithoutH3 = 0; // counter of URLs without h3
private $callerUrl = null; // caller URL of normal URL
private $skipCallerUrl = null; // caller URL of skipped URL
private $countQuery = 0; // counter of queries with INSERT, UPDATE or DELETE on getSeoSitemap table
private $optimTimes = 500000; // exec optimize of getSeoSitemap table every x $countQuery
private $titDesLen = 250; // max title / description length to save (characters)
private $maxDuplTitDes = 1000; // max number of URLs with duplicated title or description to write in log
//################################################################################
//################################################################################
/**
 * Runs the whole job as soon as the object is created: preparation, full scan
 * and final processing (end).
 *
 * Error handling:
 * - ExitException / mysqli_sql_exception: the failure is logged together with a
 *   reminder to reset the exec flag manually, then the script exits.
 * - any other Exception: the failure is logged and stopExec() is called.
 */
public function __construct(){
    // make mysqli throw mysqli_sql_exception on errors (index warnings excluded)
    mysqli_report(MYSQLI_REPORT_ALL ^ MYSQLI_REPORT_INDEX);

    try {
        $this->prep();
        $this->fullScan();
        $this->writeLog('Scan end'.PHP_EOL);
        $this->end();
    }
    // This handler must come before the generic one: PHP runs the first matching
    // catch block and mysqli_sql_exception is a subclass of Exception, so with the
    // generic handler first this branch could never be reached for it.
    catch (ExitException | mysqli_sql_exception $e){
        $this->writeLog('Execution has been stopped | '.$e->getMessage().' | Remember to set exec => n on getSeoSitemapExec table',
        $e->getFile(), $e->getLine());

        exit();
    }
    catch (Exception $e){
        $this->writeLog('Execution has been stopped | '.$e->getMessage(), $e->getFile(), $e->getLine());

        $this->stopExec();
    }
}
//################################################################################
//################################################################################
/**
 * Executes the transfer configured on $this->ch and stores the outcome in
 * pageBody, httpCode, contentType, size, md5 and lastmod.
 *
 * When curl_exec fails the error is logged, the error counter is updated through
 * getErrCounter() and the page is recorded as empty with the internal code 'C01'
 * (contentType is left untouched in that case).
 *
 * @param string $url URL being fetched (used for log / exception messages only)
 * @param int    $att attempt number written into the log
 * @throws Exception when curl_getinfo fails
 */
private function curlExec($url, $att){
    $response = curl_exec($this->ch);
    $this->pageBody = $response;

    if ($response !== false) {
        $info = curl_getinfo($this->ch);

        if ($info === false) {
            throw new Exception('curl_getinfo failed - URL: '.$url);
        }

        $this->httpCode = $info['http_code'];
        $this->contentType = $info['content_type'];
        $this->size = $info['size_download'];
    }
    else {
        $this->writeLog('curl_exec failed on '.$att.'° attempt - cURL error: '.curl_error($this->ch).' - URL: '.$url);
        $this->getErrCounter();

        // record the failed transfer as an empty page
        $this->pageBody = '';
        $this->httpCode = 'C01';
        $this->size = 0;
    }

    // common to both outcomes: fingerprint of the body and time of the check
    $this->md5 = md5($this->pageBody);
    $this->lastmod = time();
}
//################################################################################
//################################################################################
/**
 * Downloads $url through the shared cURL handle, retrying once after a 5 second
 * pause when the first attempt does not end with http code 200.
 *
 * @param string $url URL to download
 */
private function getPage($url){
    curl_setopt($this->ch, CURLOPT_URL, $url);

    $maxAttempts = 2;

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        $this->curlExec($url, $attempt);

        if ($this->httpCode === 200) {
            break;
        }

        // wait before the retry only, not after the last attempt
        if ($attempt < $maxAttempts) {
            usleep(5000000); // 5 sec
        }
    }
}
//################################################################################
//################################################################################
/**
 * Decides whether $url can be inserted as a URL to scan.
 *
 * $this->insUrl is set to true, then the URL is checked in this order:
 * mailto link (state mSkip), URL not starting with DOMAINURL (state skip),
 * URL rejected by robotsSkipTest (state rSkip). On the first match the URL is
 * registered through insSkipUrl and $this->insUrl becomes false.
 *
 * @param string $url URL to test
 */
private function pageTest($url){
    $this->insUrl = true;

    $skipState = null;

    if (strpos($url, 'mailto') === 0) {
        $skipState = 'mSkip';
    }
    elseif (strpos($url, DOMAINURL) !== 0) {
        $skipState = 'skip';
    }
    elseif ($this->robotsSkipTest($url) === true) {
        $skipState = 'rSkip';
    }

    // nothing matched: the URL stays insertable
    if ($skipState === null) {
        return;
    }

    $this->insSkipUrl($url, $skipState);
    $this->insUrl = false;
}
//################################################################################
//################################################################################
// open mysqli connection
/**
 * Connects to the database with the DBHOST, DBUSER, DBPASS and DBNAME constants,
 * stores the connection in $this->mysqli and switches it to utf8mb4.
 */
private function openMysqliConn(){
    $connection = new mysqli(DBHOST, DBUSER, DBPASS, DBNAME);

    $this->mysqli = $connection;

    $connection->set_charset('utf8mb4');
}
//################################################################################
//################################################################################
/**
 * Executes the SQL held in $this->query and publishes the outcome on the object.
 *
 * - $this->row is always emptied first.
 * - Query starting with "SELECT SQL_NO_CACHE COUNT(*) AS count": $this->count
 *   receives the count column.
 * - Any other query starting with "SELECT": $this->row receives all rows and
 *   $this->rowNum their number.
 * - Query starting with "SHOW": only $this->rowNum is set.
 * - Anything else (UPDATE, DELETE, ...): nothing is stored.
 */
private function execQuery(){
    $this->row = [];

    $sql = $this->query;
    $result = $this->mysqli->query($sql);

    // not a SELECT: only SHOW produces a result set to inspect
    if (strpos($sql, 'SELECT') !== 0) {
        if (strpos($sql, 'SHOW') === 0) {
            $this->rowNum = $result->num_rows;
            $result->free_result();
        }

        return;
    }

    if (strpos($sql, 'SELECT SQL_NO_CACHE COUNT(*) AS count') === 0) {
        $countRow = $result->fetch_assoc();
        $this->count = $countRow['count'];
    }
    else {
        while (($record = $result->fetch_assoc()) !== null) {
            $this->row[] = $record;
        }

        $this->rowNum = $result->num_rows;
    }

    $result->free_result();
}
//################################################################################
//################################################################################
// close mysqli statements
/**
 * Closes the prepared statements stmt, stmt2, stmt3, stmt4, stmt5 and stmt7,
 * in that order.
 *
 * NOTE(review): stmt6 is not closed here - presumably it is closed elsewhere;
 * verify before adding it to the list.
 */
private function closeMysqliStmt(){
    $statements = [
        $this->stmt,
        $this->stmt2,
        $this->stmt3,
        $this->stmt4,
        $this->stmt5,
        $this->stmt7,
    ];

    foreach ($statements as $statement) {
        $statement->close();
    }
}
//################################################################################
//################################################################################
/**
 * Compares the freshly downloaded page with the stored record in $this->row[0]
 * and derives changefreq and lastmod from it.
 *
 * Nothing happens when the stored size is not greater than 0. Otherwise:
 * - changefreq becomes weekly / monthly / yearly when the time elapsed since the
 *   stored lastmod exceeds 1 week / 31 days / 365 days (shorter intervals leave
 *   changefreq untouched);
 * - lastmod keeps the new timestamp only when the md5 changed, otherwise the
 *   stored lastmod is restored.
 */
private function update(){
    $stored = $this->row[0];

    // to prevent error on empty page
    if (!($stored['size'] > 0)) {
        return;
    }

    // seconds between this check and the stored last modification
    $elapsed = $this->lastmod - $stored['lastmod'];

    if ($elapsed > 604799 && $elapsed < 2678400) {
        $this->changefreq = 'weekly';
    }
    elseif ($elapsed > 2678399 && $elapsed < 31536000) {
        $this->changefreq = 'monthly';
    }
    elseif ($elapsed > 31535999) {
        $this->changefreq = 'yearly';
    }

    // unchanged content keeps its previous modification time
    $this->lastmod = ($stored['md5'] !== $this->md5) ? $this->lastmod : $stored['lastmod'];
}
//################################################################################
//################################################################################
/**
 * Analyses the page downloaded for $url: robots meta directives, SEO checks and
 * link extraction.
 *
 * Reads $this->httpCode, $this->pageBody and $this->contentType. The three flags
 * index / follow / seo start from getExclusion() applied to the content type and
 * are then lowered by the robots meta tags found in the page:
 * - "none", or "noindex" together with "nofollow": all three flags false;
 * - "noindex" alone: index false;
 * - "nofollow" alone: follow false;
 * - anything else is only written to the log.
 * The content attribute is parsed as a comma separated, case-insensitive list,
 * so forms such as "noindex,nofollow" or "noindex, follow" are honoured too.
 *
 * Side effects: skipped states are registered through insSkipUrl, title and
 * description are saved through stmt5, SEO findings are logged and counted,
 * $this->skipCallerUrl is set and, when follow is true, $this->pageLinks is
 * replaced with the unique links found in the page.
 *
 * NOTE(review): $this->pageLinks is not reset on the early returns nor when
 * follow is false - presumably the caller takes care of that; verify.
 *
 * @param string $url absolute URL of the page held in $this->pageBody
 */
private function getIndexFollowSeo($url){
    /*
    return if httpCode !== 200 (to prevent checking of failed pages) or if
    $this->pageBody is empty (to prevent error on $dom->loadHTML($this->pageBody))
    */
    if ($this->httpCode !== 200 || empty($this->pageBody) === true) {
        return;
    }

    $index = $this->getExclusion($this->contentType, $this->indexExclusion);
    $follow = $this->getExclusion($this->contentType, $this->followExclusion);
    $seo = $this->getExclusion($this->contentType, $this->seoExclusion);

    $dom = new DOMDocument;

    try {
        $pageBody = mb_convert_encoding($this->pageBody, 'HTML-ENTITIES', 'UTF-8');
    }
    catch (Error $e){
        $this->writeLog('mb_convert_encoding failed on URL '.$url.' | Error: '.$e->getMessage(), $e->getFile(), $e->getLine());

        return;
    }

    if ($pageBody === false){
        $this->writeLog('mb_convert_encoding failed on URL '.$url.' | pageBody '.var_export($pageBody, true));

        return;
    }

    // @ hides the libxml warnings raised by real-world, non valid HTML
    if (@$dom->loadHTML($pageBody) === false) {
        $this->writeLog('DOMDocument->loadHTML failed on URL '.$url.' | pageBody '.var_export($pageBody, true));

        return;
    }

    $descriptionCount = 0;

    foreach ($dom->getElementsByTagName('meta') as $val) {
        $valGetAttName = strtolower($val->getAttribute('name'));

        if ($valGetAttName === 'robots') {
            $valGetAttContent = $val->getAttribute('content');

            // the content is a comma separated list of directives; spacing and case are free
            $directives = array_map('trim', explode(',', strtolower($valGetAttContent)));
            $noIndex = in_array('noindex', $directives, true);
            $noFollow = in_array('nofollow', $directives, true);

            if (in_array('none', $directives, true) === true || ($noIndex === true && $noFollow === true)) {
                $index = $follow = $seo = false;
            }
            elseif ($noIndex === true) {
                $index = false;
            }
            elseif ($noFollow === true) {
                $follow = false;
            }
            else {
                $this->writeLog('Content of robots tag is not included in the list: content '.$valGetAttContent.' - URL '.$url);
            }
        }
        elseif ($valGetAttName === 'description') {
            $description = $val->getAttribute('content');
            $descriptionCount++;
        }
    }

    if ($index === false && $follow === false && $seo === false){
        $this->insSkipUrl($url, 'noSkip');

        return;
    }

    if ($index === false && $follow === false){
        $this->insSkipUrl($url, 'noSkip');
    }
    elseif ($index === false) {
        $this->insSkipUrl($url, 'niSkip');
    }
    elseif ($follow === false) {
        $this->insSkipUrl($url, 'nfSkip');
    }

    //### seo start
    if ($seo === true) {
        $skipUrl = [];

        // count h1
        $h1Count = $dom->getElementsByTagName('h1')->length;

        if ($h1Count > 1) {
            $this->writeLog('There are '.$h1Count.' h1 (SEO: h1 should be single) - URL '.$url);
            $this->countUrlWithMultiH1++;
        }
        elseif ($h1Count === 0) {
            $this->writeLog('H1 does not exist (SEO: h1 should be present) - URL '.$url);
            $this->countUrlWithoutH1++;
        }

        if (CHECKH2 === true){
            // count h2
            if ($dom->getElementsByTagName('h2')->length === 0) {
                $this->writeLog('H2 does not exist (SEO: h2 should be present) - URL '.$url);
                $this->countUrlWithoutH2++;
            }
        }

        if (CHECKH3 === true){
            // count h3
            if ($dom->getElementsByTagName('h3')->length === 0) {
                $this->writeLog('H3 does not exist (SEO: h3 should be present) - URL '.$url);
                $this->countUrlWithoutH3++;
            }
        }

        $titleArr = $dom->getElementsByTagName('title');
        $titleCount = $titleArr->length;

        if ($titleCount === 1) {
            $title = $titleArr->item(0)->textContent;

            // the limit is expressed in characters: count characters, not bytes (DOM strings are UTF-8)
            $titleLength = mb_strlen($title, 'UTF-8');

            if ($titleLength > $this->titDesLen) {
                $this->writeLog('Title length: '.$titleLength.' characters (title has not been registered into dBase because of its length '
                . 'is more than '.$this->titDesLen.' characters) - URL '.$url);
                $title = null;
            }
        }
        elseif ($titleCount > 1) {
            $this->writeLog('There are '.$titleCount.' titles (title has not been registered into dBase because is not single) - URL '.$url);
            $title = null;
            $this->countUrlWithMultiTitle++;
        }
        elseif ($titleCount === 0) {
            $this->writeLog('Title does not exist (SEO: title should be present) - URL '.$url);
            $title = null;
            $this->countUrlWithoutTitle++;
        }

        if ($descriptionCount === 1) {
            // same as title: length in characters
            $descriptionLength = mb_strlen($description, 'UTF-8');

            if ($descriptionLength > $this->titDesLen) {
                $this->writeLog('Description length: '.$descriptionLength.' characters (description has not been registered into dBase because '
                . 'of its length is more than '.$this->titDesLen.' characters) - URL '.$url);
                $description = null;
            }
        }
        elseif ($descriptionCount > 1) {
            $this->writeLog('There are '.$descriptionCount.' descriptions '
            . '(description has not been registered into dBase because is not single) - URL '.$url);
            $description = null;
            $this->countUrlWithMultiDesc++;
        }
        elseif ($descriptionCount === 0) {
            $this->writeLog('Description does not exist (SEO: description should be present) - URL '.$url);
            $description = null;
            $this->countUrlWithoutDesc++;
        }

        $this->optimCheck();

        // save title and description of this URL
        $this->stmt5->bind_param('sss', $title, $description, $url);
        $this->stmt5->execute();

        // iterate over extracted imgs: collect their URLs and check title / alt
        foreach ($dom->getElementsByTagName('img') as $img){
            // get absolute URL of image
            $absImg = $this->getAbsoluteUrl($img->getAttribute('src'), $url, 'img-src');
            $skipUrl[] = $absImg;

            // check if img title and img alt are present and length >= 1
            if (strlen($img->getAttribute('title')) < 1){
                $this->writeLog('Image without title: '.$absImg.' - URL: '.$url);
            }

            if (strlen($img->getAttribute('alt')) < 1){
                $this->writeLog('Image without alt: '.$absImg.' - URL: '.$url);
            }
        }

        // iterate over extracted links and collect their URLs
        foreach ($dom->getElementsByTagName('link') as $link){
            $skipUrl[] = $this->getAbsoluteUrl($link->getAttribute('href'), $url, 'link-href');
        }

        // iterate over extracted iframes and collect their URLs
        foreach ($dom->getElementsByTagName('iframe') as $iframe){
            $skipUrl[] = $this->getAbsoluteUrl($iframe->getAttribute('src'), $url, 'iframe-src');
        }

        // iterate over extracted videos and collect their URLs
        foreach ($dom->getElementsByTagName('video') as $video){
            $skipUrl[] = $this->getAbsoluteUrl($video->getAttribute('src'), $url, 'video-src');
        }

        // iterate over extracted audios and collect their URLs
        foreach ($dom->getElementsByTagName('audio') as $audio){
            $skipUrl[] = $this->getAbsoluteUrl($audio->getAttribute('src'), $url, 'audio-src');
        }

        // set skipCallerUrl to prepare pageTest in case of calling insSkipUrl from pageTest
        $this->skipCallerUrl = $url;

        // array_filter removes empty / false field
        foreach (array_filter($skipUrl) as $v) {
            $this->insSkipUrl($v, 'skip');
        }
    }
    //### seo end

    //### follow start
    if ($follow === true){
        // reset pageLinks
        $this->pageLinks = [];

        // iterate over extracted anchors and collect their URLs
        foreach ($dom->getElementsByTagName('a') as $a) {
            $this->pageLinks[] = $this->getAbsoluteUrl($a->getAttribute('href'), $url, 'a-href');
        }

        // iterate over extracted forms and get their action URLs
        foreach ($dom->getElementsByTagName('form') as $form){
            // check and scan form with get method only (the attribute value is case-insensitive in HTML)
            if (strtolower($form->getAttribute('method')) === 'get'){
                $this->pageLinks[] = $this->getAbsoluteUrl($form->getAttribute('action'), $url, 'get-method-action');
            }
        }

        // iterate over extracted scripts and collect their URLs
        foreach ($dom->getElementsByTagName('script') as $script){
            $scriptSrc = $script->getAttribute('src');

            // get absolute URL of script src only if src exists (this is to prevent error when script does not have src)
            if ($scriptSrc !== ''){
                $absScript = $this->getAbsoluteUrl($scriptSrc, $url, 'script-src');
                $this->pageLinks[] = $absScript;
            }
        }

        // drop empty / false entries and duplicates
        $this->pageLinks = array_unique(array_filter($this->pageLinks));
    }
    //### follow end
}
//################################################################################
//################################################################################
// Final stage of a run, called after the full scan. In order: removes the records
// of the previous scan, logs scan / SEO statistics, checks skipped URLs, logs failed
// URLs, sets priorities, optimizes tables, writes the SEO reports, saves and gzips
// the sitemaps, deletes the sitemaps no longer needed, verifies sitemap sizes, flags
// the new sitemap as available, rewrites robots.txt, prints the optional lists and
// finally stores the execution result on getSeoSitemapExec and closes the connection.
// Throws Exception when save() or checkSitemapSize() do not return true.
private function end(){
// delete old records of previous full scan
$this->query = "DELETE FROM getSeoSitemap WHERE state = 'old'";
$this->execQuery();
$this->writeLog('Deleted old URLs');
// execQuery stores the result of COUNT(*) queries into $this->count
$this->query = "SELECT SQL_NO_CACHE COUNT(*) AS count FROM getSeoSitemap";
$this->execQuery();
$this->writeLog($this->count.' scanned URLs');
// counters below are incremented by getIndexFollowSeo during the scan
$this->writeLog($this->countUrlWithoutTitle.' URLs without title into domain | SEO: title should be present');
$this->writeLog($this->countUrlWithMultiTitle.' URLs with multiple title into domain | SEO: title should be single');
$this->writeLog($this->countUrlWithoutDesc.' URLs without description into domain | SEO: description should be present');
$this->writeLog($this->countUrlWithMultiDesc.' URLs with multiple description into domain | SEO: description should be single');
$this->writeLog($this->countUrlWithoutH1.' URLs without h1 into domain | SEO: h1 should be present');
$this->writeLog($this->countUrlWithMultiH1.' URLs with multiple h1 into domain | SEO: h1 should be single');
if (CHECKH2 === true){
$this->writeLog($this->countUrlWithoutH2.' URLs without h2 into domain | SEO: h2 should be present');
}
if (CHECKH3 === true){
$this->writeLog($this->countUrlWithoutH3.' URLs without h3 into domain | SEO: h3 should be present');
}
$this->openCurlConn();
$this->checkSkipUrls();
// close mysqli statements (the queries below are plain, non prepared ones)
$this->closeMysqliStmt();
// list every URL that did not answer with http code 200 (mailto entries excluded)
$this->query = "SELECT SQL_NO_CACHE * FROM getSeoSitemap WHERE httpCode != '200' AND state != 'mSkip' ORDER BY url";
$this->execQuery();
if ($this->rowNum > 0) {
$this->writeLog('##### Failed URLs | They are not included into sitemap');
foreach ($this->row as $value) {
// internal codes (keys of $this->errMsg) get their own description, any other code is logged as http code
if (array_key_exists($value['httpCode'], $this->errMsg) === true) {
$logMsg = $this->errMsg[$value['httpCode']].' '.$value['httpCode'].' - URL: '.$value['url'].' - caller URL: '.$value['callerUrl'];
}
else {
$logMsg = 'Http code '.$value['httpCode'].' - URL: '.$value['url'].' - caller URL: '.$value['callerUrl'];
}
$this->writeLog($logMsg);
}
$this->writeLog('##########');
}
$this->writeLog($this->rowNum.' failed URLs | They are not included into sitemap'.PHP_EOL);
$this->setPriority();
$this->optimTables();
$this->writeLog('##### SEO');
$this->getSizeList();
$this->getMinTitleLengthList();
$this->getMaxTitleLengthList();
$this->getDuplicateTitle();
$this->getMinDescriptionLengthList();
$this->getMaxDescriptionLengthList();
$this->getDuplicateDescription();
$this->getIntUrls();
$this->getPriority();
// write changefreq into log
foreach ($this->changefreqArr as $value) {
$this->query = "SELECT SQL_NO_CACHE COUNT(*) AS count FROM getSeoSitemap "
. "WHERE changefreq = '$value' AND state NOT IN ('skip', 'mSkip', 'rSkip', 'niSkip', 'noSkip') AND httpCode = '200'";
$this->execQuery();
$this->writeLog('Set '.$value.' change frequency to '.$this->count.' URLs into sitemap');
}
// write lastmod min and max values into log
// NOTE(review): MIN / MAX are NULL when no row matches - confirm what date() should print in that case
$this->query = "SELECT SQL_NO_CACHE MIN(lastmod) AS minLastmod, MAX(lastmod) AS maxLastmod FROM getSeoSitemap "
. "WHERE state NOT IN ('skip', 'mSkip', 'rSkip', 'niSkip', 'noSkip') AND httpCode = '200'";
$this->execQuery();
$minLastmodDate = date('Y.m.d H:i:s', $this->row[0]['minLastmod']);
$maxLastmodDate = date('Y.m.d H:i:s', $this->row[0]['maxLastmod']);
$this->writeLog('Min last modified time into sitemap is '.$minLastmodDate);
$this->writeLog('Max last modified time into sitemap is '.$maxLastmodDate.PHP_EOL);
// save all sitemaps
if ($this->save() !== true){
throw new Exception('save failed');
}
// gzip all sitemaps
foreach ($this->sitemapNameArr as $key => $value) {
$this->gzip($value);
$newValue = $value.'.gz';
$fileName = $this->getFileName($newValue);
$this->writeLog('Saved '.$fileName);
// update file path into array so that it points to the gzipped file
$this->sitemapNameArr[$key] = $newValue;
}
// get the paths of all the sitemaps found in SITEMAPPATH
$fullSitemapNameArr = $this->getSitemapPaths(SITEMAPPATH);
// create an array of all sitemaps to delete (found on disk but not saved by this run)
$sitemapToDeleteArr = array_diff($fullSitemapNameArr, $this->sitemapNameArr);
// delete old missing sitemaps
foreach ($sitemapToDeleteArr as $value) {
$this->delete($value);
$fileName = $this->getFileName($value);
$this->writeLog('Deleted '.$fileName);
}
if ($this->checkSitemapSize() !== true){
throw new Exception('checkSitemapSize failed');
}
// set new sitemap is available
$this->newSitemapAvailable();
// rewrite robots.txt
$this->getRewriteRobots();
$this->getTotalUrls();
$this->getExtUrls();
// print type list if set to true
if (PRINTTYPELIST === true) {
$this->getTypeList();
}
// print changefreq list if set to true
if (PRINTCHANGEFREQLIST === true) {
$this->getChangefreqList();
}
// print priority list if set to true
if (PRINTPRIORITYLIST === true) {
$this->getPriorityList();
}
// print malformed list if set to true
if (PRINTMALFURLS === true) {
$this->getMalfList();
}
$endTime = time();
// gmdate is used to format the elapsed seconds as H:i:s
$this->writeLog('Total execution time '.gmdate('H:i:s', $endTime - $this->startTime));
$this->writeLog('##### Execution end'.PHP_EOL.PHP_EOL);
// update last execution params on getSeoSitemapExec (exec = 'n' marks the run as finished)
$this->query = "UPDATE getSeoSitemapExec "
. "SET version = '$this->version', mDate = '$endTime', exec = 'n', totUrls = '$this->totUrls' WHERE func = 'getSeoSitemap'";
$this->execQuery();
$this->mysqli->close();
}
//################################################################################
//################################################################################
/**
 * Clears the rows of the last query and every per-URL value handled by resetVars2().
 */
private function resetVars(){
    $this->row = [];

    $this->resetVars2();
}
//################################################################################
//################################################################################
/**
 * Resets the per-URL working values: size goes back to 0, while md5, lastmod,
 * changefreq, state, httpCode, insUrl and pageBody go back to null.
 */
private function resetVars2(){
    $this->size = 0;

    $nullable = ['md5', 'lastmod', 'changefreq', 'state', 'httpCode', 'insUrl', 'pageBody'];

    foreach ($nullable as $property) {
        $this->{$property} = null;
    }
}
//################################################################################
//################################################################################
/**
 * Appends one entry to the log file $this->logPath.
 *
 * The entry has the form "[date] message", optionally followed by
 * " [File ...] [Line ...]", and is always terminated by a single PHP_EOL so that
 * the file / line details stay on the same line as the message they belong to.
 * The date is expressed in the Europe/Rome time zone with microseconds; when it
 * cannot be built the text "unknown time" is used and the reason goes to error_log.
 *
 * When the log file cannot be written the problem is sent to error_log and
 * stopExec() is called.
 *
 * @param string          $logMsg message to write
 * @param string|null     $file   file where the logged event happened
 * @param int|string|null $line   line where the logged event happened
 */
private function writeLog($logMsg, $file = null, $line = null) {
    if (($ob = DateTime::createFromFormat('U.u', sprintf('%.6f', microtime(true)))) === false){
        error_log('writeLog/DateTime/createFromFormat failed', 0);
        $date = 'unknown time';
    }
    elseif ($ob->setTimeZone(new DateTimeZone('Europe/Rome')) === false){
        error_log('writeLog/setTimeZone failed', 0);
        $date = 'unknown time';
    }
    elseif (($date = $ob->format('Y-m-d H:i:s.u')) === false){
        error_log('writeLog/format failed', 0);
        $date = 'unknown time';
    }

    $msgLine = '['.$date.'] '.$logMsg;

    if ($file !== null || $line !== null) {
        $msgLine .= ' [File '.$file.'] [Line '.$line.']';
    }

    // terminate the entry only after the optional file / line details
    $msgLine .= PHP_EOL;

    if (file_put_contents($this->logPath, $msgLine, FILE_APPEND | LOCK_EX) === false) {
        error_log('getSeoSitemap execution has been stopped because of writeLog/file_put_contents cannot write file '.$this->logPath, 0);

        $this->stopExec();
    }
}
//################################################################################
//################################################################################
/**
 * Assigns the sitemap priority to the scanned URLs.
 *
 * 1. DEFAULTPRIORITY is written on every row whose state is not skip / rSkip.
 * 2. For each priority => URL prefixes pair of PARTIALURLPRIORITY, the priority is
 *    written on the indexable URLs (http code 200, not skipped) starting with the prefix.
 * 3. For each priority => URLs pair of FULLURLPRIORITY, the priority is written
 *    on the indexable URLs equal to the given URL.
 * Later steps override earlier ones.
 *
 * The configured URLs are escaped before entering the SQL text: an apostrophe
 * would otherwise break the statement and, inside LIKE, "_" and "%" would act
 * as wildcards instead of literal URL characters.
 * NOTE(review): the LIKE escaping relies on backslash being the escape character
 * (MySQL default, i.e. NO_BACKSLASH_ESCAPES not enabled) - confirm for the target server.
 */
private function setPriority(){
    $this->optimCheck();

    $this->query = "UPDATE getSeoSitemap SET priority = '".DEFAULTPRIORITY."' WHERE state != 'skip' AND state != 'rSkip'";
    $this->execQuery();

    foreach (PARTIALURLPRIORITY as $key => $value) {
        foreach ($value as $v) {
            $this->optimCheck();

            // first neutralize LIKE wildcards, then escape for the SQL string literal
            $likePrefix = $this->mysqli->real_escape_string(addcslashes($v, '\\%_'));

            $this->query = "UPDATE getSeoSitemap SET priority = '".$key."' "
            . "WHERE url LIKE '".$likePrefix."%' AND state NOT IN ('skip', 'mSkip', 'rSkip', 'niSkip', 'noSkip') AND httpCode = '200'";
            $this->execQuery();
        }
    }

    foreach (FULLURLPRIORITY as $key => $value) {
        foreach ($value as $v) {
            $this->optimCheck();

            $fullUrl = $this->mysqli->real_escape_string($v);

            $this->query = "UPDATE getSeoSitemap SET priority = '".$key."' "
            . "WHERE url = '".$fullUrl."' AND state NOT IN ('skip', 'mSkip', 'rSkip', 'niSkip', 'noSkip') AND httpCode = '200'";
            $this->execQuery();
        }
    }

    $this->writeLog('Set priority');
}
//################################################################################
//################################################################################
// Log, for every configured priority, how many URLs of table getSeoSitemap carry it.
// The priorities are the keys of PARTIALURLPRIORITY and FULLURLPRIORITY plus DEFAULTPRIORITY,
// without duplicates, handled from the highest to the lowest.
// Only rows with httpCode '200' whose state is not one of the skip states are counted.
private function getPriority(){
    $levels = array_unique(array_merge(
        array_keys(PARTIALURLPRIORITY),
        array_keys(FULLURLPRIORITY),
        [DEFAULTPRIORITY]
    ));
    rsort($levels);
    foreach ($levels as $level) {
        $this->query = "SELECT SQL_NO_CACHE COUNT(*) AS count FROM getSeoSitemap WHERE priority = '".$level."' "
        . "AND state NOT IN ('skip', 'mSkip', 'rSkip', 'niSkip', 'noSkip') AND httpCode = '200'";
        $this->execQuery();
        // $this->count is presumably filled by execQuery() with the COUNT(*) result - to be confirmed
        $this->writeLog("Set priority ".$level." to ".$this->count." URLs into sitemap");
    }
}
//################################################################################
//################################################################################
// Log the value of $this->totUrls as the number of URLs included into the sitemap,
// framed by two separator entries; the closing separator carries an extra end of line.
private function getTotalUrls() {
    $separator = '################################';
    $this->writeLog($separator);
    $this->writeLog('Included '.$this->totUrls.' URLs into sitemap');
    $this->writeLog($separator.PHP_EOL);
}
//################################################################################
//################################################################################
// Set newData to 'y' in the row of table getSeoSitemapExec whose func is 'getSeoSitemap'.
private function newSitemapAvailable(){
    $flagNewData = "UPDATE getSeoSitemapExec SET newData = 'y' WHERE func = 'getSeoSitemap'";
    $this->query = $flagNewData;
    $this->execQuery();
}
//################################################################################
//################################################################################
// Log the URLs that start with DOMAINURL and whose state is 'skip', 'rSkip', 'niSkip' or 'noSkip'.
// The number of those URLs is always logged; the URLs themselves, each with its caller URL,
// are listed only if PRINTSKIPURLS === true.
// $this->row and $this->rowNum are presumably filled by execQuery() - to be confirmed.
private function getIntUrls() {
    $this->query = "SELECT SQL_NO_CACHE url, callerUrl FROM getSeoSitemap "
    . "WHERE state IN ('skip', 'rSkip', 'niSkip', 'noSkip') AND url LIKE '".DOMAINURL."%'";
    $this->execQuery();

    if (PRINTSKIPURLS === true) {
        $this->writeLog('##### URLs into domain out of sitemap');
        if ($this->rowNum > 0) {
            // sort the rows before listing them
            asort($this->row);
            foreach ($this->row as $entry) {
                $skipped = $entry['url'];
                $caller = $entry['callerUrl'];
                $this->writeLog('URL: '.$skipped.' - caller URL: '.$caller);
            }
        }
        $this->writeLog('##########');
    }

    $this->writeLog($this->rowNum.' URLs into domain out of sitemap'.PHP_EOL);
}
//################################################################################
//################################################################################
// Log the URLs that do not start with DOMAINURL and whose state is 'skip'.
// The number of those URLs is always logged; the URLs themselves, each with its caller URL,
// are listed only if PRINTSKIPURLS === true.
// $this->row and $this->rowNum are presumably filled by execQuery() - to be confirmed.
private function getExtUrls() {
    $this->query = "SELECT SQL_NO_CACHE url, callerUrl FROM getSeoSitemap "
    . "WHERE state = 'skip' AND url NOT LIKE '".DOMAINURL."%'";
    $this->execQuery();

    if (PRINTSKIPURLS === true) {
        $this->writeLog('##### URLs out of domain out of sitemap');
        if ($this->rowNum > 0) {
            // sort the rows before listing them
            asort($this->row);
            foreach ($this->row as $entry) {
                $skipped = $entry['url'];
                $caller = $entry['callerUrl'];
                $this->writeLog('URL: '.$skipped.' - caller URL: '.$caller);
            }
        }
        $this->writeLog('##########');
    }

    $this->writeLog($this->rowNum.' URLs out of domain out of sitemap');
}
//################################################################################
//################################################################################
// Update size and httpCode of every row of table getSeoSitemap whose state is
// 'skip', 'rSkip', 'niSkip' or 'noSkip'.
// For each of those URLs getPage() and checkPageSize() are called, then $this->size and
// $this->httpCode (presumably set by those calls - to be confirmed) are stored through
// the prepared statement $this->stmt6.
// If the statement cannot be prepared, the failure is logged and stopExec() is called.
// A failed execute is logged and the loop goes on with the next URL.
private function checkSkipUrls() {
    $this->query = "SELECT SQL_NO_CACHE url FROM getSeoSitemap WHERE state IN ('skip', 'rSkip', 'niSkip', 'noSkip')";
    $this->execQuery();
    if ($this->rowNum > 0) {
        $this->stmt6 = $this->mysqli->prepare("UPDATE getSeoSitemap SET "
        . "size = ?, "
        . "httpCode = ? "
        . "WHERE url = ?");
        // prepare() returns false on failure when mysqli does not throw exceptions:
        // calling bind_param() on it would be a fatal error
        if ($this->stmt6 === false) {
            $this->writeLog('Execution has been stopped because of checkSkipUrls/prepare failed: '.$this->mysqli->error);
            $this->stopExec();
            return;
        }
        foreach ($this->row as $value) {
            $url = $value['url'];
            $this->getPage($url);
            $this->checkPageSize($url);
            $this->optimCheck();
            $this->stmt6->bind_param('sss', $this->size, $this->httpCode, $url);
            if ($this->stmt6->execute() === false) {
                $this->writeLog('checkSkipUrls/execute failed for URL '.$url.': '.$this->stmt6->error);
            }
        }
        $this->stmt6->close();
    }
    $this->writeLog('Checked skipped URLs');
}
//################################################################################
//################################################################################
// Test $url with pageTest() and, if $this->insUrl is true afterwards, pass it to insUpdNewUrlQuery().
// $url: URL to handle.
private function insNewUrl($url){
    $this->resetVars();

    // pageTest may call insSkipUrl: give it the current caller URL through skipCallerUrl
    $this->skipCallerUrl = $this->callerUrl;
    $this->pageTest($url);

    // nothing to store unless insUrl is exactly true
    if ($this->insUrl !== true) {
        return;
    }
    $this->insUpdNewUrlQuery($url);
}
//################################################################################
//################################################################################
// Execute the prepared statement $this->stmt2 for $url.
// $url: URL bound as first parameter of the statement.
// The statement is prepared outside this method, so its SQL is not visible here;
// judging by the method name it presumably inserts or updates the URL - to be confirmed.
private function insUpdNewUrlQuery($url){
// checkUrlLength() and optimCheck() are called before the statement is executed
$this->checkUrlLength($url);
$this->optimCheck();
// three string parameters: the URL, then the current caller URL twice
$this->stmt2->bind_param('sss', $url, $this->callerUrl, $this->callerUrl);
$this->stmt2->execute();
}
//################################################################################
//################################################################################
private function linksScan(){
foreach ($this->pageLinks as $url) {
$this->insNewUrl($url);
}