This repository has been archived by the owner on Jan 5, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtweets.php
executable file
·4069 lines (3654 loc) · 147 KB
/
tweets.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/php
<?php
/**
 * tweets.php - CLI script for manipulating an un-zipped twitter full backup data dump
 * relies on command-line tools, tested on MacOS. To view the grailbird output files use
 * https://github.com/vijinho/tweets-gb.
 *
 * @author Vijay Mahrra <vijay@yoyo.org>
 * @copyright (c) Copyright 2018 Vijay Mahrra
 * @license GPLv3 (http://www.gnu.org/licenses/gpl-3.0.html)
 */
// force UTF-8 handling for all string output
ini_set('default_charset', 'utf-8');
ini_set('mbstring.encoding_translation', 'On');
// NOTE(review): mbstring.func_overload was deprecated in PHP 7.2 and removed
// in PHP 8.0 — this setting is ignored on modern PHP; confirm target version.
ini_set('mbstring.func_overload', 6);
//-----------------------------------------------------------------------------
// required commands check
// map of external binary name => human-readable description; get_commands()
// (defined later in this file) resolves each to a full path or fails
$requirements = [
    'find' => 'cli command: find',
    'grep' => 'cli command: grep',
    'cut' => 'cli command: cut',
    'xargs' => 'cli command: xargs',
    'gunzip' => 'cli command: gunzip',
    'convert' => 'tool: convert - https://imagemagick.org/script/convert.php',
    'curl' => 'tool: curl - https://curl.haxx.se',
    'wget' => 'tool: wget - https://www.gnu.org/software/wget/',
];
$commands = get_commands($requirements);
// abort early if any required command is missing from the system
if (empty($commands)) {
    verbose('Error: Missing commands.', $commands);
    exit;
}
//-----------------------------------------------------------------------------
// define command-line options
// see https://secure.php.net/manual/en/function.getopt.php
// : - required, :: - optional
$options = getopt('hvdtf:g:i:auolxr:k:',
    [
        'help',
        'verbose',
        'debug',
        'test',
        'dir:',
        'dir-output:',
        'format:',
        'filename:',
        'grailbird:',
        'copy-media:',
        'grailbird-media',
        'grailbird-import:',
        'list',
        'list-js',
        'list-images',
        'list-videos',
        'list-users',
        'list-missing-media',
        'organize-media',
        'download-missing-media',
        'list-profile-images',
        'download-profile-images',
        'tweets-file:',
        'tweets-count',
        'tweets-all',
        'date-from:',
        'date-to:',
        'regexp:',
        'regexp-save:',
        'no-retweets',
        'no-mentions',
        'media-only',
        'urls-expand',
        'urls-resolve',
        'urls-check',
        'urls-check-source',
        'urls-check-force',
        'offline',
        'local',
        'media-prefix:',
        'delete',
        'dupes',
        'keys-required:',
        'keys-remove:',
        'keys-filter:',
        'thread:',
        'threads-tweets:',
        'minimal',
    ]);
// Build $do: action name => 0/1 flag, set when either the short (e.g. 'v')
// or long (e.g. 'verbose') command-line option was supplied.
// FIX: removed a duplicate 'test' => ['t', 'test'] entry (a repeated array
// key is silently overwritten, but it was dead weight) and guarded against
// passing a null short-option key to array_key_exists() (deprecated in
// PHP 8.1+ for string|int parameters).
$do = [];
foreach ([
    'verbose' => ['v', 'verbose'],
    'test' => ['t', 'test'],
    'debug' => ['d', 'debug'],
    'grailbird' => ['g', 'grailbird'],
    'copy-media' => [null, 'copy-media'],
    'grailbird-media' => [null, 'grailbird-media'],
    'grailbird-import' => [null, 'grailbird-import'],
    'list' => [null, 'list'],
    'list-js' => [null, 'list-js'],
    'list-images' => [null, 'list-images'],
    'list-videos' => [null, 'list-videos'],
    'list-users' => [null, 'list-users'],
    'list-missing-media' => [null, 'list-missing-media'],
    'organize-media' => [null, 'organize-media'],
    'download-missing-media' => [null, 'download-missing-media'],
    'list-profile-images' => [null, 'list-profile-images'],
    'download-profile-images' => [null, 'download-profile-images'],
    'tweets-count' => [null, 'tweets-count'],
    'tweets-all' => ['a', 'tweets-all'],
    'no-retweets' => [null, 'no-retweets'],
    'no-mentions' => [null, 'no-mentions'],
    'media-only' => [null, 'media-only'],
    'urls-expand' => [null, 'urls-expand'],
    'urls-resolve' => ['u', 'urls-resolve'],
    'urls-check' => [null, 'urls-check'],
    'urls-check-force' => [null, 'urls-check-force'],
    'urls-check-source' => [null, 'urls-check-source'],
    'offline' => ['o', 'offline'],
    'local' => ['l', 'local'],
    'media-prefix' => [null, 'media-prefix'],
    'unlink' => ['x', 'delete'],
    'dupes' => [null, 'dupes'],
    'keys-required' => [null, 'keys-required'],
    'keys-remove' => ['r', 'keys-remove'],
    'keys-filter' => ['k', 'keys-filter'],
    'thread' => [null, 'thread'],
    'threads-tweets' => [null, 'threads-tweets'],
    'minimal' => [null, 'minimal'],
] as $i => $opts) {
    // (int) cast turns the boolean "either option present" into 0 or 1
    $do[$i] = (int) ((null !== $opts[0] && array_key_exists($opts[0], $options))
        || array_key_exists($opts[1], $options));
}
// resolve option implications/dependencies — ORDER MATTERS: each rule below
// may enable flags that later rules then act upon
if (array_key_exists('debug', $do) && !empty($do['debug'])) {
    // --debug implies --verbose
    $do['verbose'] = $options['verbose'] = 1;
}
if (array_key_exists('media-prefix', $options)) {
    // --media-prefix implies --local (media paths refer to local files)
    $do['local'] = $options['local'] = 1;
}
if (array_key_exists('list-missing-media', $do) || array_key_exists('organize-media',
        $do)) {
    // these operations need the full tweet set and local file info
    $do['tweets-all'] = $options['tweets-all'] = 1;
    $do['local'] = $options['local'] = 1;
}
if (array_key_exists('urls-check-source', $options) || array_key_exists('urls-check-force',
        $options)) {
    // both checking variants imply the basic url check
    $do['urls-check'] = $options['urls-check'] = 1;
}
if (array_key_exists('urls-check', $options)) {
    // --urls-check implies --urls-resolve
    $do['urls-resolve'] = $options['urls-resolve'] = 1;
}
if (array_key_exists('urls-resolve', $options)) {
    // --urls-resolve implies --urls-expand
    $do['urls-expand'] = $options['urls-expand'] = 1;
}
if ($do['thread']) {
    // replace the 0/1 flag with the actual (positive) thread id, 0 if invalid
    $thread_id = (int) $options['thread'];
    if ($thread_id < 1) {
        $thread_id = 0;
    }
    $do['thread'] = $thread_id;
}
if (array_key_exists('threads-tweets', $options)) {
    $do['threads-tweets'] = 1;
}
if ($do['threads-tweets']) {
    // replace the flag with the minimum tweets-per-thread count (default 2)
    $threads_tweets = (int) $options['threads-tweets'];
    if ($threads_tweets < 1) {
        $threads_tweets = 2;
    }
    $do['threads-tweets'] = $threads_tweets;
}
if (!empty($do['copy-media'])) {
    // replace the flag with the destination folder; implies --local
    $do['copy-media'] = $options['copy-media'];
    $do['local'] = 1;
}
if (!empty($do['minimal'])) {
    $do['minimal'] = true;
}
//-----------------------------------------------------------------------------
// defines (int) - forces 0 or 1 value
// these constants are used throughout the rest of the script instead of $do
define('DEBUG', (int) $do['debug']);
define('VERBOSE', (int) $do['verbose']);
define('TEST', (int) $do['test']);
define('UNLINK', (int) $do['unlink']);
define('OFFLINE', (int) $do['offline']);
debug('COMMANDS:', $commands);
debug('OPTIONS:', $do);
//-----------------------------------------------------------------------------
// help: shown when no options given or -h/--help requested.
// NOTE: this block also hosts the 'options:' and 'errors:' goto labels that
// other parts of the script jump to — do not remove them.
if (empty($options) || array_key_exists('h', $options) || array_key_exists('help',
        $options)) {
    options:
    // prepend the README contents to the usage text if available
    $readme_file = 'README.md';
    if (file_exists($readme_file)) {
        $readme = file_get_contents('README.md');
        if (!empty($readme)) {
            output($readme . "\n");
        }
    }
    echo "Requirements:\n";
    foreach ($requirements as $cmd => $desc) {
        printf("%s:\n\t%s\n", $cmd, $desc);
    }
    // usage text: option reference followed by worked examples
    echo join("\n",
        [
            'Usage: tweets.php',
            'Manipulates tweets taken from an exported twitter archive.',
            "(Specifying any other unknown argument options will be ignored.)\n",
            "\t-h, --help Display this help and exit",
            "\t-v, --verbose Run in verbose mode",
            "\t-d, --debug Run in debug mode (implies also -v, --verbose)",
            "\t-t, --test Run in test mode, show what would be done, NO filesystem changes.",
            "\t --dir={.} Directory of unzipped twitter backup files (current dir if not specified)",
            "\t --dir-output={.} Directory to output files in (default to -dir above)",
            "\t --format={json} Output format for script data: txt|md|php|json (default)",
            "\t-f, --filename={output.} Filename for output data from operation, default is 'output.{--OUTPUT_FORMAT}'",
            "\t --grailbird-import={dir} Import in data from the grailbird json files of the standard twitter export. If specified with '-a' will merge into existing tweets before outputting new file.",
            "\t-g, -g={dir} Generate json output files compatible with the standard twitter export feature to dir",
            "\t --copy-media= Copy local media files to the given folder",
            "\t --grailbird-media Copy local media files to grailbird folder, using same file path",
            "\t --media-prefix Prefix to local media folder instead of direct file:// path, e.g. '/' if media folders are to be replicated under webroot for serving via web and prefixing a URL path, implies --local",
            "\t --list Only list all files in export folder and halt - filename",
            "\t --list-js Only List all javascript files in export folder and halt",
            "\t --list-images Only list all image files in export folder and halt",
            "\t --list-videos Only list all video files in export folder and halt",
            "\t --list-users Only list all users in tweets, (default filename 'users.json') and halt",
            "\t --list-missing-media List media URLs for which no local file exists and halt (implies --local)",
            "\t --organize-media Organize local downloaded media, for example split folder into date/month subfolders",
            "\t --download-missing-media Download missing media (from --list-missing-media) and halt, e.g.. missing media files (implies --local)",
            "\t --list-profile-images Only list users profile images, (in filename 'users.json') and halt",
            "\t --download-profile-images WARNING: This can be a lot of users! Download profile images.",
            "\t --tweets-count Only show the total number of tweets and halt",
            "\t-i, --tweets-file={tweets.js} Load tweets from different json input file instead of default twitter 'tweets.js' or 'tweets.json' (priority if exists)",
            "\t-a, --tweets-all Get all tweets (further operations below will depend on this)",
            "\t --date-from Filter tweets from date/time, see: https://secure.php.net/manual/en/function.strtotime.php",
            "\t --date-to Filter tweets up-to date/time, see: https://secure.php.net/manual/en/function.strtotime.php ",
            "\t --no-retweets Drop re-tweets (RT's)",
            "\t --no-mentions Drop tweets starting with mentions",
            "\t --minimal Minimal output for each tweet, no superfluous data like tweet IDs.",
            "\t --media-only Only media tweets",
            "\t --urls-expand Expand URLs where shortened and data available (offline) in tweet (new attribute: text)",
            "\t-u, --urls-resolve Unshorten and dereference URLs in tweet (in new attribute: text) - implies --urls-expand",
            "\t --urls-check Check every single target url (except for twitter.com and youtube.com) and update - implies --urls-resolve",
            "\t --urls-check-source Check failed source urls - implies --urls-resolve",
            "\t --urls-check-force Forcibly checks every single failed (numeric) source and target url and update - implies --urls-check",
            "\t-o, --offline Do not go-online when performing tasks (only use local files for url resolution for example)",
            "\t-l, --local Fetch local file information (if available) (new attributes: images,videos,files)",
            "\t-x, --delete DANGER! At own risk. Delete files where savings can occur (i.e. low-res videos of same video), run with -t to test only and show files",
            "\t --dupes List (or delete) duplicate files. Requires '-x/--delete' option to delete (will rename duplicated file from '{tweet_id}-{id}.{ext}' to '{id}.{ext}). Preview with '--test'!",
            "\t --keys-required=k1,k2,. Returned tweets which MUST have all of the specified keys",
            "\t-r, --keys-remove=k1,k2,. List of keys to remove from tweets, comma-separated (e.g. 'sizes,lang,source,id_str')",
            "\t-k, --keys-filter=k1,k2,. List of keys to only show in output - comma, separated (e.g. id,created_at,text)",
            "\t --regexp='/<pattern>/i' Filter tweet text on regular expression, i.e /(google)/i see https://secure.php.net/manual/en/function.preg-match.php",
            "\t --regexp-save=name Save --regexp results in the tweet under the key 'regexps' using the key/id name given",
            "\t --thread=id Returned tweets for the thread with id",
            "\t --threads-tweets={n} *BROKEN* When exporting markdown, save threads to markdown files which have a minimum of n tweets",
            "\nExamples:",
            "\nReport duplicate tweet media files and output to 'dupes.json':",
            "\ttweets.php -fdupes.json --dupes",
            "\nDelete duplicate tweet media files (will rename them from '{tweet_id}-{id}.{ext}' to '{id}.{ext})':",
            "\ttweets.php --delete --dupes",
            "\nShow total tweets in tweets file:",
            "\ttweets.php --tweets-count --format=txt",
            "\nWrite all users mentioned in tweets to default file 'users.json':",
            "\ttweets.php --list-users",
            "\nShow javascript files in backup folder:",
            "\ttweets.php -v --list-js",
            "\nResolve all URLs in 'tweets.js' file, writing output to 'tweets.json':",
            "\ttweets.php -v -u --filename=tweets.json",
            "\nResolve all URLs in 'tweets.js' file, writing output to grailbird files in 'grailbird' folder and also 'tweets.json':",
            "\ttweets.php -u --filename=tweets.json -g=export/grailbird",
            "\nGet tweets from 1 Jan 2017 to 'last friday', only id, created and text keys:",
            "\ttweets.php -d -v -o -u --keys-filter=id,created_at,text,files --date-from '2017-01-01' --date-to='last friday'",
            "\nList URLs for which there are missing local media files:",
            "\ttweets.php -v --list-missing-media",
            "\nDownload files from URLs for which there are missing local media files:",
            "\ttweets.php -v --download-missing-media",
            "\nOrganize 'tweet_media' folder into year/month subfolders:",
            "\ttweets.php -v --organize-media",
            "\nPrefix the local media with to a URL path 'assets':",
            "\ttweets.php -v --media-prefix='/assets'",
            "\nGenerate grailbird files with expanded/resolved URLs:",
            "\ttweets.php -v -u -g=export/grailbird",
            "\nGenerate grailbird files with expanded/resolved URLs using offline saved url data - no fresh checking:",
            "\ttweets.php -v -o -u -g=export/grailbird",
            "\nGenerate grailbird files with expanded/resolved URLs using offline saved url data and using local file references where possible:",
            "\ttweets.php -v -o -u -l -g=export/grailbird",
            "\nGenerate grailbird files with expanded/resolved URLs using offline saved url data and using local file references, dropping retweets:",
            "\ttweets.php -v -o -u -l -g=export/grailbird --no-retweets",
            "\nFilter tweet text on word 'hegemony' since last year, exporting grailbird:",
            "\ttweets.php -v -o -u -l -g=export/grailbird --regexp='/(hegemony)/i' --regexp-save=hegemony",
            "\nExtract the first couple of words of the tweet and name the saved regexp 'words':",
            "\ttweets.php -v -o -u -l -x -g=export/grailbird --regexp='/^(?P<first>[a-zA-Z]+)\s+(?P<second>[a-zA-Z]+)/i' --regexp-save=words",
            "\nImport grailbird tweets and export tweets with local media files to web folder:",
            "\ttweets.php -v -g=www/vijinho/ --media-prefix='/vijinho/' --grailbird-media --grailbird-import=vijinho/import/data/js/tweets",
            "\nImport twitter grailbird files,check URL and export new grailbird files:",
            "\ttweets.php -v -g=www/vijinho/ --grailbird-import=import/data/js/tweets --urls-check",
            "\nImport and merge grailbird files from 'import/data/js/tweets', fully-resolving links and local files:",
            "\ttweets.php -v -o -l -u --grailbird-import=import/data/js/tweets -g=export/grailbird",
            "\nExport only tweets which have the 'withheld_in_countries' key to export/grailbird folder:",
            "\ttweets.php -v -u -o --keys-required='withheld_in_countries' -g=export/grailbird",
            "\nExport only tweets containing text 'youtu':",
            "\ttweets.php -v --regexp='/youtu/' -g=www/vijinho/ --media-prefix='/vijinho/' --grailbird-media",
            "\nExport only no mentions, no RTs':",
            "\ttweets.php -v -g=www/vijinho/ --media-prefix='/vijinho/' --grailbird-media --no-retweets --no-mentions",
            "\nExport only media tweets only':",
            "\ttweets.php -v -g=www/vijinho/ --media-prefix='/vijinho/' --grailbird-media --media-only",
            "\nExport the tweet thread 967915766195609600 as a markdown file in the current directory:",
            "\ttweets.php -v --no-retweets --no-mentions --format=md --thread=967915766195609600 --filename=967915766195609600.md",
            "\nExport the tweet thread 967915766195609600 as a markdown file with media in the directory 967915766195609600:",
            "\ttweets.php -v --no-retweets --no-mentions --format=md --thread=967915766195609600 --filename=967915766195609600/967915766195609600.md --copy-media=967915766195609600",
            "\nExport the tweet thread 967915766195609600 as grailbird export files, to tweets to thread.json and folder called thread:",
            "\ttweets.php -v --thread=967915766195609600 --filename=www/thread/data/js/thread.json -g=www/thread/ --media-prefix='/thread/' --grailbird-media",
            "\nExport the tweet thread 967915766195609600 as a js file test/test.json, and copy media files too:",
            "\ttweets.php -v --dir=vijinho --thread=1108500373298442240 --filename=test/test.json --copy-media=test",
            "\nExport the tweet thread 967915766195609600 as markdown, and copy media files too:",
            "\ttweets.php -d -v --dir=vijinho --thread=967915766195609600 --filename=thread/vijinho_967915766195609600_md/item.md --media-prefix=/vijinho_967915766195609600_md/ --copy-media=thread/vijinho_967915766195609600_md --format=md",
            "\nResolve URLs from tweets.js/tweets.json file and create a complete grailbird-data export, creating a new tweets.json file after to",
            "\ttweets.php -v -d --date-from '2019-05-01' --urls-expand --urls-resolve --grailbird-media --media-prefix='/' --grailbird=grailbird --filename='tweets.json'",
            "\nGenerate markdown output file of all tweets except RTs and mentions for threads which have at least 10 tweets",
            "\ttweets.php -v -d --no-retweets --no-mentions --format=md --filename=output.md --threads-tweets=10",
        ]) . "\n";
    // goto jump here if there's a problem
    // shared error handler: prints accumulated $errors (if any) then exits
    errors:
    if (!empty($errors)) {
        if (is_array($errors)) {
            output("Error(s):\n\t- " . join("\n\t- ", $errors) . "\n");
        } else {
            print_r($errors);
            exit;
        }
    } else {
        output("No errors occurred.\n");
    }
    exit;
}
//-----------------------------------------------------------------------------
// url manipulation and handling variables
// known URL shorteners: links via these hosts get dereferenced/updated
$url_shorteners = [// dereference & update URLs if moved or using shortener which is not twitters
    '53eig.ht', 'aca.st', 'amzn.to', 'b-o-e.uk', 'b0x.ee', 'bankofeng.uk',
    'bbc.in', 'bit.ly',
    'bitly.com', 'bloom.bg', 'boe.uk', 'bru.gl', 'buff.ly', 'cnb.cx', 'cnnmon.ie',
    'dailym.ai',
    'deck.ly', 'dld.bz',
    'dlvr.it', 'econ.st', 'eff.org', 'eurone.ws', 'fal.cn', 'fb.me', 'for.tn', 'go.nasa.gov',
    'go.shr.lc',
    'goo.gl', 'ht.ly', 'hubs.ly', 'huff.to', 'ind.pn', 'instagr.am',
    'interc.pt',
    'j.mp', 'jrnl.ie', 'jtim.es', 'kurl.nl', 'ln.is',
    'n.mynews.ly', 'newsl.it', 'n.pr',
    'nyp.st', 'nyti.ms', 'on.fb.me', 'on.ft.com', 'on.mktw.net', 'on.rt.com', 'on.wsj.com',
    'ow.ly', 'owl.li',
    'po.st', 'poal.me', 'ptv.io', 'read.bi', 'reut.rs', 'rviv.ly', 'sc.mp', 'scl.io',
    'shr.gs', 'shar.es',
    'socsi.in', 'spon.de',
    'spoti.fi', 'spr.ly', 'sptnkne.ws', 'str.sg', 't.co', 'tgam.ca', 'ti.me', 'tinurl.us',
    'tinyurl.com',
    'tlsur.net', 'tmblr.co', 'tr.im', 'trib.al', 'tws.io', 'vrge.co', 'wapo.st',
    'wef.ch', 'wp.me',
    'wpo.st', 'wrd.cm', 'wrld.bg', 'www.goo.gl', 'xhne.ws', 'yhoo.it', 'youtu.be',
];
/*
 * Bad shorteners, trouble resolving:
 * amzn.com
 * gu.com
 * is.gd
 * lnkd.in
 * min.ie
 */
// expired domains or domains we do not want to follow
$hosts_expired = [
    'b0x.ee', '4sq.com', 'vid.me',
];
// return codes from curl (url_resolve() function below) which indiciate we should not try to resolve a url
// -22 signifies a wget failure, the rest are from curl
// https://ec.haxx.se/usingcurl-returns.html
$curl_errors_dead = [3, 6, 7, 18, 28, 47, 52, 56, -22];
//-----------------------------------------------------------------------------
// initialise variables
$errors = []; // errors to be output if a problem occurred
$output = []; // data to be output at the end
// throttle/checkpoint tuning: looser limits when OFFLINE (no network waits)
$save_every = OFFLINE ? 1000 : 350; // save results every so often when looping, e.g. urls checked online
$online_sleep_under = OFFLINE ? 0 : 0.2; // sleep if under this many seconds elapsed performing online operation
$online_sleep = OFFLINE ? 0 : 0.1; // time to wait between each online operation
debug('save_every: ' . $save_every);
debug('online_sleep_under: ' . $online_sleep_under);
debug('online_sleep: ' . $online_sleep);
$tweets = [];
$tweets_count = 0;
$missing_media = []; // missing local media files, [filename => source url]
//-----------------------------------------------------------------------------
// determine the script output format; anything other than the recognised
// values (txt, php, md, json) falls back to the default 'json'
$format = empty($options['format']) ? '' : $options['format'];
if (!in_array($format, ['txt', 'php', 'md'], true)) {
    $format = 'json';
}
define('OUTPUT_FORMAT', $format);
verbose(sprintf('OUTPUT_FORMAT: %s', $format));
//-----------------------------------------------------------------------------
// get dir to read unzipped twitter backup archive files from
$dir = '.';
if (!empty($options['dir'])) {
    $dir = $options['dir'];
}
// validate via realpath() but keep using the user-supplied (possibly
// relative) $dir for all subsequent operations
$dircheck = realpath($dir);
if (empty($dircheck) || !is_dir($dircheck)) {
    $errors[] = 'You must specify a valid directory!';
    goto errors;
}
verbose(sprintf('TWEET DIR: %s', $dir));
//-----------------------------------------------------------------------------
// resolve the tweets input data filename: -i wins, then --tweets-file,
// otherwise prefer a previously-generated 'tweets.json' over the raw
// twitter export 'tweets.js'
$tweets_file = !empty($options['i'])
    ? $options['i']
    : (!empty($options['tweets-file']) ? $options['tweets-file'] : null);
if (null === $tweets_file) {
    // NOTE(review): file_exists() here checks the current working directory,
    // not $dir — confirm whether that is intentional
    $tweets_file = file_exists('tweets.json') ? 'tweets.json' : 'tweets.js';
}
verbose(sprintf('TWEETS FILENAME: %s', $tweets_file));
//-----------------------------------------------------------------------------
// resolve the output data filename: -f wins, then --filename, with a
// fallback to 'output.json' when absent, empty, or repeated (array)
$output_filename = '';
foreach (['f', 'filename'] as $opt) {
    if (!empty($options[$opt])) {
        $output_filename = $options[$opt];
        break;
    }
}
if (empty($output_filename) || is_array($output_filename)) {
    $output_filename = 'output.json';
}
// always non-empty at this point
verbose(sprintf('OUTPUT FILENAME: %s', $output_filename));
//-----------------------------------------------------------------------------
// local media prefix (URL path prepended to local media references)
// NOTE(review): $media_prefix stays undefined when the option is absent —
// presumably later code guards with isset/empty; verify
if (!empty($options['media-prefix'])) {
    $media_prefix = $options['media-prefix'];
}
//-----------------------------------------------------------------------------
// users data filename: when listing users, write to the requested output
// file (default 'users.json'); otherwise keep it alongside the export dir
$users_filename = $do['list-users']
    ? (empty($output_filename) ? 'users.json' : $output_filename)
    : $dir . '/users.json';
//-----------------------------------------------------------------------------
// parse optional --date-from / --date-to filters into unix timestamps.
// On success $date_from/$date_to hold timestamps; on parse failure an error
// is queued (the shared error handler is reached later via goto errors).
// FIX: the original still called date('r', false) in the verbose message
// after a failed strtotime() — the confirmation is now skipped on failure.
if (!empty($options['date-from'])) {
    $date_from = $options['date-from'];
}
if (!empty($date_from)) {
    $date_from = strtotime($date_from);
    if (false === $date_from) {
        $errors[] = sprintf('Unable to parse --date-from: %s',
            $options['date-from']);
    } else {
        verbose(sprintf("Filtering tweets FROM date/time '%s': %s",
            $options['date-from'], date('r', $date_from)));
    }
}
if (!empty($options['date-to'])) {
    $date_to = $options['date-to'];
}
if (!empty($date_to)) {
    $date_to = strtotime($date_to);
    if (false === $date_to) {
        $errors[] = sprintf('Unable to parse --date-to: %s', $options['date-to']);
    } else {
        verbose(sprintf("Filtering tweets TO date/time '%s': %s",
            $options['date-to'], date('r', $date_to)));
    }
}
//-----------------------------------------------------------------------------
// validate the optional --regexp tweet-text filter pattern
if (!empty($options['regexp'])) {
    $regexp = $options['regexp'];
}
if (!empty($regexp)) {
    // preg_match() returns false (with a warning) when the pattern itself is
    // invalid; an empty subject suffices for validating the pattern.
    // FIX: pass '' instead of null — null subjects are deprecated since
    // PHP 8.1. Also only report "filtering" when the pattern is valid.
    if (false === preg_match($regexp, '')) {
        $errors[] = sprintf('Unable to validate regular expression: %s',
            $options['regexp']);
    } else {
        verbose(sprintf("Filtering tweets with regular expression '%s'",
            $options['regexp']));
    }
}
// optional name under which --regexp matches are stored in each tweet
$regexp_save = array_key_exists('regexp-save', $options) ? $options['regexp-save']
    : false;
// abort via the shared error handler if any option failed validation above
if (!empty($errors)) {
    goto errors;
}
//-----------------------------------------------------------------------------
// pre-fetch the complete file listing when any operation below needs it
$files = [];
if ($do['list'] || $do['local'] || $do['dupes']) {
    debug('Fetching files list from: ' . $dir);
    $files = files_list($dir);
    if (empty($files)) {
        // nothing to work with — bail out via the shared error handler
        $errors[] = 'No files found!';
        goto errors;
    }
}
// fetch per-type file lists; each --list-* option outputs its list and halts
// (goto output), while --local keeps the lists for later enrichment
if ($do['list-images'] || $do['local']) {
    verbose('Fetching images list…');
    $images = files_images($dir);
    if ($do['list-images']) {
        debug('Image files:', $images);
        $output = $images;
        goto output;
    }
}
if ($do['list-videos'] || $do['local']) {
    verbose('Fetching videos list…');
    $videos = files_videos($dir);
    if ($do['list-videos']) {
        debug('Video files:', $videos);
        $output = $videos;
        goto output;
    }
}
if ($do['list-js'] || $do['local']) {
    verbose('Fetching js list…');
    $js = files_js($dir);
    if ($do['list-js']) {
        debug('Javascript files:', $js);
        $output = $js;
        goto output;
    }
}
//-----------------------------------------------------------------------------
// prepare arrays for file list data
// $files, $images, $videos, $js and append to $output
// --list: output the full file listing fetched above and halt
if ($do['list']) {
    verbose('Listing files…');
    debug('Files:', $files);
    $output = $files;
    goto output;
}
//-----------------------------------------------------------------------------
// --dupes: find duplicate media files (and optionally rename/delete them).
// Media filenames look like {tweet_id}-{media_id}.{ext}; files sharing the
// same {media_id}.{ext} portion are duplicates of the same media.
// FIX at the end of this block: the original had `if (empty($errors)) goto
// errors;` — inverted, so it jumped to the error handler when NO errors
// occurred and silently skipped reporting when errors DID occur.
if ($do['dupes']) {
    verbose('Finding duplicate files...');
    // index paths by the media key: key => [path, path, ...]
    $keys = [];
    foreach ($files as $file => $path) {
        // split on - because filename is {tweet_id}-{media_id}.{ext}
        if (!preg_match("/(?P<tweet_id>^[\d]+)-(?P<key>[^\.]+)\.(?P<ext>.+)/",
                $file, $parts)) {
            continue;
        }
        $key = $parts['key']; // media id portion, extension not included
        if (array_key_exists($key, $keys)) {
            verbose(sprintf("Duplicate file found: %s\n\t%s\n\t%s", $key,
                $keys[$key][0], $path));
        }
        $keys[$key][] = $path;
    }
    // keep only keys that matched more than one file
    foreach ($keys as $key => $paths) {
        if (1 === count($paths)) {
            unset($keys[$key]);
        }
    }
    if (empty($keys)) {
        verbose('No duplicate files found.');
        goto output;
    }
    verbose(sprintf('Files duplicated: %d', count($keys)));
    // without -x/--delete we only report the duplicates
    $output = $keys;
    if (!UNLINK) {
        goto output;
    }
    // we are going to delete unless used with --test
    if (TEST) {
        verbose('TEST: No files will actually be deleted!');
    }
    $deletes = [];
    // prefer deleting copies under 'direct_message_media' and
    // 'moments_tweets_media' first, keeping the 'tweet_media' copy
    foreach ($keys as $filename => $paths) {
        foreach ($paths as $p => $path) {
            if (false !== stristr($path, '/direct_message_media/') ||
                false !== stristr($path, '/moments_tweets_media/')) {
                $deletes[] = $path;
                unset($paths[$p]);
            }
        }
        sort($paths); // reset the index numbering to 0, 1, 2...
        $keys[$filename] = $paths;
    }
    // of the remaining copies: keep (and rename) the first, delete the rest
    $renames = [];
    foreach ($keys as $key => $paths) {
        // the kept file is renamed from {tweet_id}-{id}.{ext} to {id}.{ext}
        $renames[$paths[0]] = stristr($paths[0], $key);
        if (1 === count($paths)) {
            continue;
        }
        unset($paths[0]); // remove the kept first element
        if (empty($paths)) {
            continue;
        }
        // all other files for the key can be deleted
        foreach ($paths as $p => $path) {
            $deletes[] = $path;
        }
        unset($keys[$key]);
    }
    ksort($renames);
    if (DEBUG) {
        debug(sprintf('Files to rename: %d', count($renames)), $renames);
    } else {
        verbose(sprintf('Files to rename: %d', count($renames)));
    }
    foreach ($renames as $from => $to) {
        // prepend path of $from file to $to before renaming
        $to = substr($from, 0, strrpos($from, '/') + 1) . $to;
        if (TEST) {
            verbose("Renaming (NOT!): $from => $to");
        } else {
            verbose("Renaming: $from => $to");
            if (!rename($from, $to)) {
                $errors[] = "Error renaming file: $from => $to";
            }
        }
    }
    ksort($deletes);
    if (DEBUG) {
        debug(sprintf('Files to delete: %d', count($deletes)), $deletes);
    } else {
        verbose(sprintf('Files to delete: %d', count($deletes)));
    }
    foreach ($deletes as $path) {
        if (TEST) {
            verbose('Deleting (NOT!): ' . $path);
        } elseif (UNLINK) {
            verbose('Deleting: ' . $path);
            if (!unlink($path)) {
                $errors[] = "Error deleting file: $path";
            }
        }
    }
    // FIX: report failures via the error handler (original check was inverted)
    if (!empty($errors)) {
        goto errors;
    }
    $output = [];
    goto output;
}
//-----------------------------------------------------------------------------
// --tweets-count: report the total number of tweets and halt
$tweets = [];
$tweets_count = 0;
if ($do['tweets-count']) {
    verbose('Counting tweets…');
    $tweets_count = tweets_count($dir);
    verbose("Tweets Count: $tweets_count");
    $output = [$tweets_count];
    goto output;
}
//-----------------------------------------------------------------------------
// load account details from the export's account.js file
$account_file = 'account.js';
verbose(sprintf("Loading account details from '%s'", $account_file));
$account = json_load_twitter($dir, $account_file);
if (!empty($account) && !is_string($account)) {
    $account = $account[0]['account'];
    verbose('Account details loaded:', $account);
} else {
    // non-fatal: queue the problem but continue without account details
    $errors[] = 'No account file found!';
    if (is_string($account)) {
        // json_load_twitter() returned an error message string
        $errors[] = 'JSON Error: ' . $account;
    }
    // goto errors;
}
//-----------------------------------------------------------------------------
// fetch tweets - all: load every tweet from the tweets data file.
// FIX: the original counted/reported tweets inside `if (empty($tweets))` —
// inverted, so the count only ran when nothing loaded (and count() on a
// string error value would TypeError on PHP 8). Failures now normalise
// $tweets to [] (matching how later import code treats it).
if ($do['tweets-all'] || $do['list-users']) {
    verbose(sprintf("Loading tweets from '%s'", $tweets_file));
    $tweets = json_load_twitter($dir, $tweets_file);
    if (empty($tweets) || is_string($tweets)) {
        // json_load_twitter() returns an error string on failure
        verbose("No tweets found loading tweets from: $tweets_file");
        $tweets = [];
    }
    $tweets_count = count($tweets);
    verbose(sprintf('Tweets loaded: %d', $tweets_count));
}
//-----------------------------------------------------------------------------
// get directory for importing grailbird data and js files there-in
if ($do['grailbird-import']) {
    $grailbird_import_dir = '';
    if (!empty($options['grailbird-import'])) {
        $grailbird_import_dir = $options['grailbird-import'];
    } else {
        $grailbird_import_dir = $dir . '/import/data/js/tweets';
    }
    $grailbird_import_dir = realpath($grailbird_import_dir);
    if (empty($grailbird_import_dir) || !is_dir($grailbird_import_dir)) {
        $errors[] = 'You must specify a valid grailbird import directory!';
        goto errors;
    }
    verbose(sprintf('GRAILBIRD IMPORT DIR: %s', $grailbird_import_dir));
    $grailbird_files = files_js($grailbird_import_dir);
    if (empty($grailbird_files)) {
        $errors[] = sprintf('No grailbird js files found to import in: %s!',
            $grailbird_import_dir);
        goto errors;
    }
    if (empty($grailbird_files) || !is_array($grailbird_files)) {
        // defensive normalisation; unreachable in practice because the
        // empty case exits via `goto errors` just above
        $grailbird_files = [];
    } else {
        ksort($grailbird_files);
        debug(sprintf("Importing tweets from '%s'", $grailbird_import_dir),
            $grailbird_files);
        if (empty($tweets) || !is_array($tweets)) {
            $tweets = [];
        }
        foreach ($grailbird_files as $file => $path) {
            // we only want the files which are yyyy_mm.js*
            if (!preg_match("/^([\d]{4}_[\d]{2}\.js.*)/i", $file, $matches)) {
                unset($grailbird_files[$file]);
                continue;
            }
            $filename = basename($file);
            $data = json_load_twitter($grailbird_import_dir, $filename);
            if (!is_array($data)) {
                // BUG FIX: was `$errors = sprintf(...)` which replaced the
                // whole $errors array with a plain string; append instead
                $errors[] = sprintf('No data found in file: %s', $path);
                goto errors;
            }
            debug(sprintf('Importing tweets from: %s', $path));
            // merge each tweet
            foreach ($data as $tweet_id => $tweet) {
                unset($data[$tweet_id]);
                $tweet_id = (int) $tweet_id;
                // (comments below were swapped in the original; the branch
                // logic itself is unchanged)
                if (array_key_exists($tweet_id, $tweets)) {
                    // already known: drop the imported created_at because it
                    // is missing the time for most tweets before 2010/11
                    unset($tweet['created_at']);
                } else {
                    // didn't exist: seed it before the merge below
                    $tweets[$tweet_id] = $tweet;
                }
                // merge imported fields over whatever we already hold
                $tweets[$tweet_id] = array_replace_recursive($tweets[$tweet_id],
                    $tweet);
                // we need to remove the 'retweeted_status' entry to the top level to match 'tweets.js'
                if (array_key_exists('retweeted_status', $tweet)) {
                    $tweet_rt = $tweet['retweeted_status'];
                    $tweet_rt['id'] = (int) $tweet_rt['id'];
                    $id = $tweet_rt['id'];
                    unset($tweet['retweeted_status']);
                    if (array_key_exists($id, $tweets)) {
                        $tweets[$id] = array_replace_recursive($tweet, $tweet_rt);
                    }
                    // NOTE(review): when the retweeted tweet is unknown it is
                    // deliberately NOT added (the assignment was commented out
                    // in the original) — confirm this is intended
                }
            }
        }
        $tweets = array_column($tweets, null, 'id'); // re-index
        ksort($tweets);
        $save = json_save($output_filename, $tweets);
        if (true !== $save) {
            $errors[] = "\nFailed encoding JSON output file:\n\t$output_filename\n";
            $errors[] = "\nJSON Error: $save\n";
            goto errors;
        } else {
            verbose(sprintf("JSON written to output file:\n\t%s (%d bytes)\n",
                $output_filename, filesize($output_filename)));
        }
    }
}
//-----------------------------------------------------------------------------
// directory for grailbird output
if ($do['grailbird']) {
    if (!empty($options['g'])) {
        $grailbird_dir = $options['g'];
    } elseif (!empty($options['grailbird'])) {
        $grailbird_dir = $options['grailbird'];
    }
    if (empty($grailbird_dir)) {
        // BUG FIX: missing '/' separator — every other path concatenation in
        // this file uses $dir . '/...' (realpath()ed dirs have no trailing
        // slash), so the default produced e.g. '/path/to/direxport/grailbird'
        $grailbird_dir = $dir . '/export/grailbird';
    }
    if (!file_exists($grailbird_dir)) {
        mkdir($grailbird_dir, 0777, true);
    }
    if (!is_dir($grailbird_dir)) {
        $errors[] = 'You must specify a valid grailbird output directory!';
        goto errors;
    }
    $grailbird_dir = realpath($grailbird_dir);
    verbose(sprintf('GRAILBIRD OUTPUT DIR: %s', $grailbird_dir));
}
// copy files to target: media either goes to the grailbird export dir or is
// keyed on the requested thread id
if ($do['grailbird-media']) {
    $do['copy-media'] = $grailbird_dir;
} elseif (!empty($thread_id) && $thread_id > 0) {
    $do['copy-media'] = $thread_id;
}
//-----------------------------------------------------------------------------
// load in (if previously saved) list of resolved urls => target
$file_urls = $dir . '/urls.json';
$urls = json_load($file_urls);
if (!is_string($urls)) {
    verbose(sprintf("Loaded previously saved urls from:\n\t%s", $file_urls));
} else {
    $errors[] = $urls; // non-fatal so continue
    $urls = [];
}
verbose(sprintf('URLs loaded: %d', count($urls)));
// summarise the number of source urls and target urls by host and tidy-up urls
if (!empty($urls)) {
    $src_hosts = [];
    $target_hosts = [];
    $unresolved = [];
    $curl_errors = [];
    foreach ($urls as $url => $target) {
        $u = parse_url($url);
        if (!empty($u['host'])) {
            $src_hosts[] = $u['host'];
        }
        // remove urls that resolve to themselves or are unparseable
        if (array_key_exists($target, $urls)) {
            if ((!is_numeric($target) && $target === $urls[$target]) || false === parse_url($target)) {
                unset($urls[$target]);
            }
        }
        $t = parse_url($target);
        // BUG FIX: parse_url() returns false for seriously malformed URLs;
        // array_key_exists()/count() on false is a TypeError in PHP 8+
        if (false === $t) {
            continue;
        }
        if (!empty($t['host'])) {
            $target_hosts[] = $t['host'];
        } elseif (1 == count($t)) {
            // single-component parse => unresolved target (cURL error code)
            $unresolved[$url] = $target;
            if (!array_key_exists($target, $curl_errors)) {
                $curl_errors[$target] = 0;
            }
            ++$curl_errors[$target];
        }
    }
    $src_hosts = array_count_values($src_hosts);
    $target_hosts = array_count_values($target_hosts);
    ksort($target_hosts);
    ksort($unresolved);
    ksort($curl_errors);
    debug('All source URL hosts:', $src_hosts);
    // keep only popular target hosts (>= 25 occurrences) for the summary
    foreach ($target_hosts as $host => $count) {
        if ($count < 25) {
            unset($target_hosts[$host]);
        }
    }
    debug('Most popular target hosts:', $target_hosts);
    debug('Previous failed cURL targets:', $unresolved);
    debug('Summary failed cURL errors:', $curl_errors);
    foreach ($target_hosts as $host => $count) {
        // strict comparison: hosts and shorteners are both strings
        if (in_array($host, $url_shorteners, true)) {
            debug(sprintf("Unresolved short URL TARGET: $host (%d)", $count));
            if (array_key_exists($host, $src_hosts)) {
                debug(sprintf("Resolved short URL SOURCE: $host (%d)",
                    $src_hosts[$host]));
            }
        }
    }
    unset($src_hosts);
    unset($target_hosts);
    unset($unresolved);
    unset($curl_errors);
}
//-----------------------------------------------------------------------------
// list all users
// Load the previously-saved users cache; json_load() returns a string on error.
verbose('Getting all users mentioned in tweets…');
$users = json_load($users_filename);
if (is_string($users)) {
    // load failed: start with an empty users cache
    $users = [];
    $users_count = 0;
} else {
    $users_count = count($users);
    verbose(sprintf("Loaded %d previously saved users from '%s'",
        $users_count, $users_filename));
}