%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Master thesis draft %
%Maintainer: Christopher Chan%
%Version: 0.2.1 %
%Date: 2022-09-13 %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\documentclass[11pt, a4paper, twoside]{report}
\usepackage[utf8]{inputenc}
\usepackage[british]{babel}
\usepackage{graphicx}
\usepackage{blindtext}
\usepackage{amsmath}
\usepackage{titlesec}
\usepackage{hyperref}
\usepackage{float}
\usepackage{lscape}
\usepackage{csvsimple}
\usepackage{longtable}
\usepackage{geometry}
\restylefloat{table}
\usepackage{fancyhdr, ragged2e}
\fancyhead{}
\pagestyle{fancy}
%in the preamble
%--------------------------------
\usepackage[
backend=biber,
style=numeric,
sorting=anyvt
]{biblatex}
\addbibresource{HOTOSM.bib}
%--------------------------------
\hypersetup{
colorlinks=true,
linkcolor=blue,
filecolor=magenta,
urlcolor=cyan,
pdfpagemode=FullScreen,
}
\titleformat{\chapter}[hang]{\normalfont\itshape\huge\bfseries\flushright}{\thechapter}{1cm}{}
\titleformat{\section}[hang]{\normalfont\Large\bfseries}{\thesection}{0.5cm}{}
\titleformat{\subsection}[hang]{\normalfont\normalsize\bfseries}{\thesubsection}{0.5cm}{}
\titleformat{\subsubsection}[hang]{\normalfont\small\bfseries}{\thesubsubsection}{0.5cm}{}
\graphicspath{{/home/chris/Dropbox/HOTOSM/figures/}}
\lhead{\parbox[t]{0.4\textwidth}{\RaggedRight\rightmark\strut}}
\rhead{\parbox[t]{0.4\textwidth}{\RaggedLeft\leftmark\strut}}
\setlength{\headheight}{5\baselineskip}
\pagenumbering{roman}
\linespread{1.5}
\renewcommand{\listfigurename}{List of figures}
\renewcommand{\listtablename}{List of tables}
\nocite{*}
\begin{document}
\newgeometry{margin=1in}
\begin{titlepage}
\begin{center}
\vspace{0.25cm}
\large
\textbf{Deep learning for building segmentation using very high resolution open UAV data}
\vspace{0.75cm}
\normalsize
\textbf{CHAN, Yan-Chak Christopher}
\vspace{2cm}
\textbf{Under the supervision of} \\
Prof. Dr. Hannes Taubenböck \textsuperscript{\hyperlink{DLR}{1}} \textsuperscript{\hyperlink{JMWU}{2}} \\
Matthias Weigand \textsuperscript{\hyperlink{DLR}{1}} \\
Emran Alchikh Alnajar \textsuperscript{\hyperlink{HOT}{3}}
\vspace{1cm}
Master thesis submitted for the degree of\\
\textbf{Master of Science (MSc.)}\\
\vspace{0.25cm}
in\\
\vspace{0.25cm}
Applied Earth Observation and Geoanalysis of the Living Environment (EAGLE)
\vspace{0.5cm}
\includegraphics[scale = 0.3]{neuSIEGEL.png}
\vspace{0.5cm}
\normalsize
Faculty of Philosophy\\
Julius-Maximilians-Universität Würzburg\\
\footnotetext[1]{\hypertarget{DLR}{Department of Geo-Risks and Civil Security, German Remote Sensing Data Center (DFD), German Aerospace Center (DLR)}}
\footnotetext[2]{\hypertarget{JMWU}{Department of Remote Sensing, Institute of Geography and Geology, Julius-Maximilians-Universität Würzburg}}
\footnotetext[3]{\hypertarget{HOT}{Humanitarian OpenStreetMap Team}}
\end{center}
\end{titlepage}
\restoregeometry
\newpage
\begin{abstract}
Refugee camps and informal settlements provide accommodation to some of the
most vulnerable populations, many of which are located in Sub-Saharan Africa. Many of these settlements lack the up-to-date geoinformation that is taken for granted in the developed world. Up-to-date maps of their extent and spatial layout are essential tools for administrative tasks such as crisis intervention, infrastructure development, and population estimation, all of which encourage economic productivity. Within the OpenStreetMap ecosystem, there is a disparity between the built-up areas digitised in developed and in developing regions. This data inequality stems from multiple causes, ranging from a lack of commercial interest to knowledge gaps among data contributors, and it can be reduced with the help of assisted mapping technology. Very High Resolution remote sensing imagery and Machine Learning based methods can exploit textural, spectral, and morphological characteristics and are commonly used to extract information from these complex environments. In particular, recent advances in Deep Learning based Computer Vision have achieved significant results. This study is connected to a larger initiative to open-source the AI-assisted mapping platform within the current Humanitarian OpenStreetMap Team ecosystem, and investigates the capability of Deep Learning for building footprint delineation in refugee camps based on open-data Unmanned Aerial Vehicle (UAV) imagery from partner organisation OpenAerialMap.\\\par
The objective of this study is to test the performance of the U-Net and several variations of the architecture for building footprint segmentation. The performance of the different Deep Learning models was collected on datasets of varying complexity. A comparison of the models' responses using class-based accuracy assessment metrics allows a detailed evaluation of how the different architectures and experiment setups respond to data quality.\\\par
Given the computational and resource constraints of this project, the results suggest that an increase in architectural depth corresponds with an increase in precision. Models initialised with pre-trained weights from ImageNet could show reduced recall. Lastly, and to our surprise, a competition-winning network trained on imagery of similar resolution but of formal buildings transferred worse than many models trained from scratch.\\\par
This study showcases the ability of Deep Learning semantic segmentation to perform building footprint delineation in complex humanitarian applications. Increased access to open-data Very High Resolution UAV imagery from the OpenAerialMap initiative is an advantage for building AI-assisted humanitarian mapping. The study demonstrated a careful and rigorous approach to model evaluation. The variation in the results not only emphasises the complexity of Deep Learning based methods, but also indicates directions for further investigation that would be justifiable when further resources become available.\\\par
\end{abstract}
\newpage
\section{Forewords and Acknowledgements}
Service to the less-advantaged! This is the value taught by my parents since I was small, and it has since become the main motivator for the completion of this Masterarbeit. I have been extremely fortunate to be able to carry out such important work in partnership with the Humanitarian OpenStreetMap Team and the German Aerospace Center (DLR). This project came about as a hope to do something scientifically rigorous yet practically impactful. Through a technical, careful study, I, the supervisors, and our partners hope that this work can provide direction and minimise future mishaps of the open-source AI-assisted mapping solution. Working on this project has been a journey of surprises, long nights, new friendships, and adventures, from which I gained bountifully, ultimately culminating in the work being accepted by the Proceedings of the Academic Track at State of the Map 2022 (\textit{Chan et al., 2022}). All of this could not have been possible without the generous facilitation of Emran Alchikh Alnajar and the passionate yet pedantic Matthias Weigand, who maintained high standards and scientific rigour while motivating me to focus on the light at the end of the tunnel. I would also like to thank my past classmates, colleagues, and lecturers, who tolerated and kept patience with me in times of desperation and frustration. Lastly, I must extend my appreciation to my parents Lucia Leung and Savio Chan. Without their support, I would never have had the resources and opportunities to get to where I am now and complete this work.
\newpage
\section{Declaration of Independent Work}
I confirm that all this work is my own, and that I have
\begin{itemize}
\item Clearly referenced/listed all sources as appropriate
\item Referenced and put in inverted commas all quoted text (from books, web, etc)
\item Given the sources of all pictures, data etc. that are not my own
\item Not made any use of the report(s) or essay(s) of any other student(s) either past or present
\item Not sought or used the help of any external professional agencies for the work
\item Acknowledged in appropriate places any help that I have received from others (e.g. fellow students, technicians, statisticians, external sources)
\end{itemize}
\vspace{5cm}
München, 2022.09.27
\newpage
\pagenumbering{arabic}
\tableofcontents
\newpage
\listoffigures
\newpage
\listoftables
\clearpage
\newpage
\pagestyle{plain}
\section{Abbreviations}
\begin{itemize}
\item SDG: Sustainable Development Goals
\item MDG: Millennium Development Goals
\item OSM: OpenStreetMap
\item HOT: Humanitarian OpenStreetMap Team
\item UAV: Unmanned Aerial Vehicle
\item CV: Computer Vision
\item DL: Deep Learning
\item CNN: Convolutional Neural Network
\item SGD: Stochastic Gradient Descent
\item OCC: Open-Cities-AI-Challenge
\item OA: Overall Accuracy
\item IoU: Intersection-over-Union
\end{itemize}
\newpage
\pagestyle{fancy}
\chapter{Introduction}\label{Intro}
The world's population is more urbanised than ever before. As of 2018, approximately 4 billion people (55\%) reside in urban areas (UN DESA., 2018, Taubenböck et al., 2009), of which 60\% reside in slums often located at the fringes of the city (Venables A., 2018). The urban population is expected to grow by a further 2.5 billion between 2018 and 2050, most of which will be in Asia and Africa (UN DESA., 2018). When population growth outpaces development, informal settlements become the supplier of significant housing stock. These informal settlements are dynamic and reflect cultural practices, access to resources, financial limitations, and other socio-economic conditions. As a result, informal settlements differ significantly between urban and rural contexts in roof covers and densities, and are subject to different levels and types of access to resources. In particular, refugee camps and their development are often dependent on international aid and humanitarian efforts, resulting in a unique urban morphology where parts of older refugee camps resemble the chaotic characteristics of informal settlements while newer extensions receive careful planning.\\\par
Refugee camps are often the most common or the only way for displaced people to receive shelter and assistance. They are typically set up in proximity to the displaced population, whether displaced by natural disasters, human-caused disasters, or other reasons. Throughout history, refugee sites have provided haven to the world's most vulnerable populations (UN, 2018, Turner S., 2016, UNHCR, 2021). However, only around 1.4 million out of 26.4 million refugees had access to a third-country solution between 2016 and 2021 (UNHCR, 2021). Additionally, although officially defined as temporary settlements, many refugee camps have had longer than expected life cycles; some have even become ``Secondary Cities'' and therefore suffer similar problems of poor governance and rapid urbanisation, which consequently makes them unattractive for investment (Cities Alliance \& AfDB., 2022). For the many refugee camps and informal settlements that have lasted well beyond their expected temporary role, there are generally three ways of resolving the issue as outlined by the Global Compact on Refugees (UN, 2018): 1. voluntary repatriation, 2. relocation to a third country, and 3. local integration, although actual implementation is often subject to the will of the host sovereign state. Recent studies have suggested that local integration often has a net positive economic impact on the surrounding region (Alix-Garcia et al., 2018, Rummery A., 2019, IFC., 2018). \\\par
Since 2000, the United Nations has codified sets of global development goals to which the member states committed; development projects are therefore encouraged to align their objectives with these global goals. The United Nations Department of Economic and Social Affairs has published a set of 17 Sustainable Development Goals (herein SDG) to be achieved by 2030 as the successor to the Millennium Development Goals (herein MDG), which concluded in 2015 (UN, 2015). Special attention is drawn to Goals 1 and 10, which are particularly relevant to this study.\\\par
\begin{itemize}
\item \textit{Goal 1: End poverty in all its forms everywhere}
\begin{itemize}
\item \textit{Target 1.1: By 2030, eradicate extreme poverty for all people everywhere, currently measured as people living on less than \$1.25 a day}
\item \textit{Target 1.4: By 2030, ensure that all men and women, in particular the poor and the vulnerable, have equal rights to economic resources, as well as access to basic services, ownership and control over land and other forms of property, inheritance, natural resources, appropriate new technology and financial services, including microfinance}
\item \textit{Target 1.b: Create sound policy frameworks at the national, regional and international levels, based on pro-poor and gender-sensitive development strategies, to support accelerated investment in poverty eradication actions}
\end{itemize}
\item \textit{Goal 10: Reduce inequality within and among countries}
\begin{itemize}
\item \textit{Target 10.1: By 2030, empower and promote the social, economic and political inclusion of all, irrespective of age, sex, disability, race, ethnicity, origin, religion or economic or other status}
\item \textit{Target 10.7: Facilitate orderly, safe, regular and responsible migration and mobility of people, including through the implementation of planned and well-managed migration policies}
\end{itemize}
\end{itemize}
Having up-to-date maps is therefore paramount for short- and long-term humanitarian projects: from the delivery of essential medicine and spatial and policy planning to population estimation, quality and timely maps are essential to improve future decision making in both humanitarian and non-humanitarian contexts. Although there has been an overall net increase in the number of buildings mapped on the OpenStreetMap (herein OSM) platform, contribution is still skewed towards developed cities and countries. Patterns of episodic contribution may be observed after disasters, but the data inequality in contributions is not easily reconciled (Herfort et al., 2021) (\textit{see figure \ref{fig:data_inequality}}).\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.5]{data_inequality.png}
\caption{Data contribution of buildings and highways in all and in humanitarian settings within OSM (Herfort et al., 2021)}
\label{fig:data_inequality}
\end{figure}
Part of the reason for this can be traced to both the knowledge of volunteer contributors and commercial interest; for a detailed discussion, see Anderson et al., 2019, Veselovsky et al., 2021, and Yang et al., 2016. As the topic and data provider of this project, the Humanitarian OpenStreetMap Team (herein HOT) has been at the forefront of using open and crowd-sourced mapping data to support humanitarian causes, from shorter-term disaster response to longer-term epidemiology and microfinance campaigns (HOT, 2021). HOT would like to make use of the recent advancement of Deep Learning (herein DL) in the geospatial field (e.g. Herfort et al., 2019, Kuffer et al., 2016, Wurm et al., 2021, Quinn et al., 2018) to develop an open-source, open-data AI-assisted mapping solution to reduce the geospatial data inequality in the OSM ecosystem. The result of this work will form part of a pilot study to trial the use of Unmanned Aerial Vehicle (herein UAV) imagery and existing labelled data in the segmentation of two major refugee camps in East Africa.\\\par
\newpage
\chapter{Literature Review}\label{LitReview}
\section{Remote Sensing of Informal Settlements}\label{RSofInformalSettlement}
Human-built environments are complex, often combining very different materials in close conjunction within a dense environment. From power lines, factories, and car parks to leisure parks, imaging of urban environments therefore requires imagery that is high in both spatial and temporal resolution. Urban remote sensing calls for techniques that extract geometric, textural, and other physical features, as opposed to the more common spectral-index-based approach used in ecological or environmental remote sensing (Jensen J., 2007, NRC., 1998).\\\par
Informal settlement and slum mapping of developing countries requires very high resolution (VHR) imagery, which was unavailable until the turn of the century. The relatively new technology thus only began to gain traction within the last two decades, particularly with the increased availability of civilian commercial VHR satellites. The increase in computational power has enabled novel techniques such as multi-layer machine learning, textural analysis, and novel geostatistical methods to emerge (Kuffer et al., 2016). Remote sensing derived proxies of socio-economic conditions have been able to compensate for traditional sources such as temporally infrequent censuses (e.g. Watmough et al., 2012, Watmough et al., 2015, Watmough et al., 2019). Censuses conducted in developing areas also fall short in capturing socio-spatial patterns, potentially overlooking other socio-economic determinants such as access to amenities and infrastructure. Adequate pro-poor policy development therefore hinges on the availability of up-to-date and good quality analysis (Kuffer et al., 2016, Sliuzas et al., 2017). Remote sensing of settlements largely falls under two categories, rural or urban. Due to their different socio-economic contexts and urban morphologies, sensing of rural and urban settlements requires different parameters. Additionally, there is no ``one size fits all'' way to generalise informal and formal settlements across the world, as physical geography, topography, culture, and available resources often dictate the distribution, development, and clustering patterns of settlements. These unique requirements have made DL techniques particularly useful, and many applications of DL in remote sensing have therefore been in the urban domain (Ma et al., 2019).
\section{Deep Learning in Urban Remote Sensing}\label{DLinRS}
The following section is divided into three parts. The first part reviews the concept of Computer Vision (herein CV) as a field of study, its previous common practices, and the evolution towards data-driven Deep Learning. The second part focuses on a domain-specific review of recent AI-based building segmentation practices, particularly recent practices in informal settlement segmentation. Lastly, the third part explains the mechanism of the Convolutional Neural Network, the class of neural networks commonly used for CV tasks.\\\par
\subsection{Computer Vision and a brief review of Convolutional Neural Networks}\label{CVinBS}
Computer Vision is the practice of extracting information from digitised imagery. First applied in robotics, it has become an interdisciplinary field of study utilised by physicians, biologists, and remote sensing scientists alike (Rosenfeld A., 1988, Szeliski R., 2010).\\\par
The field of CV traditionally used mathematical operations applied to imagery represented as multi-dimensional arrays. These include point and local operations, statistical computation, geometric operations, transformations, and the extraction of geometric entities; for a detailed discussion see Rosenfeld A. (1988). The following decades saw CV operations developed into more specialised applications, for example edge detectors (e.g. Sobel, Prewitt, Marr-Hildreth) or Grey-Level Co-occurrence Matrix kernels (e.g. Haralick textures) (e.g. Kuffer et al., 2014, Kuffer et al., 2016, Wurm et al., 2017) (Pal \& Pal, 1993, Blaschke T., 2010, Blaschke et al., 2014). CV-based segmentation experienced something akin to a Kuhnian paradigm shift (Kuhn T., 1962) when AlexNet, a Convolutional Neural Network trained on a GPU (Krizhevsky et al., 2012), won the preeminent large-scale ImageNet CV challenge (Deng et al., 2009); this reinvigorated the use of multi-layered neural networks in CV tasks (LeCun et al., 2015, Bengio et al., 2017). The paradigm shift coincided with the increase in computational power provided by the Graphical Processing Unit (GPU), which has enabled CNNs to be applied successfully across domains ranging from biomedical imaging to remote sensing (Ma et al., 2018, Zhu et al., 2017, Zhang et al., 2016, Wurm et al., 2019).\\\par
In the field of CV, there are generally four types of application: 1. semantic segmentation, 2. classification and localisation, 3. object detection, and 4. instance segmentation (\textit{see figure \ref{fig:CV_tasks}}) (Stevens et al., 2020). This study conducts semantic segmentation as a binary classification between built-up and non-built-up.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.25]{CV_tasks.png}
\caption{The four main types of Computer Vision tasks (Stanford University, 2022)}
\label{fig:CV_tasks}
\end{figure}
The purpose of semantic segmentation is to assign a named (semantic) class to every pixel of the input image (Rosenfeld A., 1988, Szeliski R., 2010). This is commonly applied in remote sensing for Land Use Land Cover classification, where every pixel is assigned a Land Cover or Land Use type. Another application is binary segmentation, where the model is trained to assign a named class only to particular clusters of associated pixels; this is more common in single-class segmentation. The difference between mere classification and semantic segmentation is that semantic segmentation outputs a mask over the image in which each pixel is assigned to a semantic class, whereas classification only gives a confidence for the semantics of the whole scene without assigning a class to each pixel.\\\par
\subsection{Deep Learning and Convolutional Neural Networks}
The closest resemblance to modern methods first appeared around the 1960s, with the first CNNs appearing around the 1980s. However, Deep Learning based methodologies went in and out of popularity, driven partly by a lack of computational resources and partly by the non-linearity of the field's progression (i.e. the parts did not come together at the right time), causing the ``AI winter'' between the 1970s and the 1990s (Schmidhuber J., 2014). Prior to the resurgence of DL's popularity, the set of methodologies now associated with neural networks was known as the multi-layered perceptron, initially inspired by a mathematical analogy codifying the function of a single neuron in the seminal Psychological Review paper published by Rosenblatt F. (1958). Like a human neuron, a perceptron at the most fundamental level takes a numerical input, applies a stored transformation, and creates an output (\textit{see equation \ref{weights&bias}}; a minimal numerical sketch is given after the criteria list below). By stacking basic perceptrons, a multi-layered perceptron structure can be created. For such a structure to be computationally useful, it must satisfy the following criteria:\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.5]{NeuronPerceptron.png}
\caption{Schematic analogy diagram between a biological neuron and an artificial perceptron (Fumo D., 2017).}
\label{fig:NeuronPerceptron}
\end{figure}
\begin{equation}
\label{weights&bias}
f(\sum_{i ... n} w_{i} x_{i} + b)
\end{equation}
\begin{itemize}
\item Where:
\begin{itemize}
\item $f =$ Activation function
\item $\sum_{(i ... n)} =$ Summation of i to nth dimension
\item $w_{i} x_{i} =$ Weights multiplied by original input variable ($x$)
\item $b =$ bias
\end{itemize}
\end{itemize}
\begin{enumerate}
\item Collections of connected perceptrons are capable of plasticity (i.e. changing values) through training.
\item Perceptrons will form dominant pathways that "fire" (activate) together.
\item Through training, perceptrons will learn to apply positive or negative reinforcement to facilitate minimising error (e.g. assigning and changing "weights").
\end{enumerate}
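To make equation \ref{weights&bias} concrete, the following minimal Python sketch evaluates a single perceptron forward pass. The weight, bias, and input values are arbitrary illustrative assumptions, not values from any model trained in this study.
\begin{verbatim}
import numpy as np

def perceptron(x, w, b):
    # f(sum_i w_i * x_i + b) with a sigmoid activation function
    z = np.dot(w, x) + b             # weighted sum of inputs plus bias
    return 1.0 / (1.0 + np.exp(-z))  # activation squashes the output to (0, 1)

# Arbitrary example values: three inputs, three weights, one bias
x = np.array([0.2, 0.5, 0.1])
w = np.array([0.4, -0.6, 0.9])
b = 0.05
print(perceptron(x, w, b))           # a single scalar output
\end{verbatim}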
The particular group of such perceptron structures used in CV is known as the Convolutional Neural Network (herein CNN). The most basic CNN consists of three parts: 1. an input layer, 2. multiple hidden convolution and pooling layers, and 3. an output layer which provides the segmentation result and the associated confidence level (\textit{see figures \ref{fig:NeuronPerceptron} \& \ref{fig:ConvNet}}). A Deep Learning neural network system is therefore strung together from a series of inter-connected layers whose parameters are adjusted to adapt to the data provided. Through careful iterative training and adjustment, the system can generalise well not only to the training and validation data, but to future datasets as well.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.5]{ConvNet.png}
\caption{Schematic diagram of a CNN (Stanford University, 2022).}
\label{fig:ConvNet}
\end{figure}
\subsubsection{Convolution and Pooling}\label{Conv&Pool}
The hidden layers of the CNN are where the network performs representation learning, with each successive layer learning more abstract features of the input training image. While not always the case, it is conventional practice to interleave the convolutional and pooling layers (Stevens et al., 2020).\\\par
The convolutional layer essentially treats the image as a stack of vectors in a three-dimensional layout with an input shape of $(Batch\ Size, Channel_{in}, Height, Width)$; the convolutional kernel slides over the input and applies the weighting and bias terms to extract deeper features (\textit{see figure \ref{fig:Conv}}) (Stevens et al., 2020), creating an output of shape $(Batch\ Size, Channel_{out}, Height, Width)$. The full transformation per convolutional layer turns \textit{equation \ref{weights&bias}} for each pixel into \textit{equation \ref{nn.Conv2d}}.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.3]{Conv.png}
\caption{3 x 3 Convolution (Stanford University, 2022).}
\label{fig:Conv}
\end{figure}
\begin{equation}
\label{nn.Conv2d}
out(N_{i}, C_{out j}) = bias(C_{out j}) + \sum_{k=0}^{C_{in}-1} weight(C_{out j}, k) * input(N_{i}, k)
\end{equation}
\begin{itemize}
\item Where:
\begin{itemize}
\item $N =$ Batch Size
\item $C =$ Channels
\item $k =$ Input channel index
\end{itemize}
\end{itemize}
The pooling layer reduces the spatial dimension by downsampling, conventionally by applying a smaller kernel which extracts the desired value from each window. Maximum pooling, which returns only the largest value within the downsampling kernel, is common and is used in the network architectures of this experiment (Stevens et al., 2020). The pooling layer scales down the image while retaining the most crucial information (\textit{see figure \ref{fig:maxpool}}; a minimal sketch of both operations follows the figure).\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.3]{maxpool.jpeg}
\caption{Max pooling (Stanford University, 2022).}
\label{fig:maxpool}
\end{figure}
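The convolution and pooling operations described above can be sketched in a few lines of PyTorch. The tensor sizes below (a batch of 32 three-band $256 \times 256$ tiles and 64 output channels) are illustrative assumptions rather than the exact configuration of the networks trained in this study.
\begin{verbatim}
import torch
import torch.nn as nn

x = torch.randn(32, 3, 256, 256)    # (Batch Size, Channel_in, Height, Width)

conv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
pool = nn.MaxPool2d(kernel_size=2)  # keeps the maximum of each 2 x 2 window

features = conv(x)                  # -> (32, 64, 256, 256): learned feature maps
downsampled = pool(features)        # -> (32, 64, 128, 128): halved spatial size
print(features.shape, downsampled.shape)
\end{verbatim}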
\subsubsection{Optimiser and the Binary Cross Entropy Loss function}\label{Optim&BCE}
As each batch of data is ingested through the neural network, the output is compared against the reference labels for error calculation. The summed average of the loss defines the cost function landscape against which the error value is calculated; the score is penalised when the prediction is incorrect and rewarded otherwise. Backpropagation finds the sensitivity of the cost function to each weight and bias through the chain rule, which enables the adjustment of the weights and biases in the subsequent pass. Because this study is a binary segmentation task, the Binary Cross Entropy loss function was used to measure error (\textit{see equation \ref{BCELoss}}; a minimal sketch follows the equation).
\begin{equation}
\label{BCELoss}
l(x, y) = L = \{l_{1}, \dots, l_{N}\}^T, \quad l_{n} = -w_{n}[y_{n} \cdot \log x_{n} + (1 - y_{n}) \cdot \log(1 - x_{n})]
\end{equation}
\begin{itemize}
\item Where:
\begin{itemize}
\item $N =$ Batch Size
\item $l(x, y) =$ loss(Probability, Binary Classification)
\item $l_{n} =$ loss at sample $n$
\item Further details can be found in the \href{https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html#torch.nn.BCELoss}{PyTorch documentation for nn.BCELoss}
\end{itemize}
\end{itemize}
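As a minimal illustration of equation \ref{BCELoss}, the sketch below computes the Binary Cross Entropy loss between a predicted probability map and a binary building mask using PyTorch's nn.BCELoss; the tensors are random placeholders, not data from this study.
\begin{verbatim}
import torch
import torch.nn as nn

criterion = nn.BCELoss()                              # Binary Cross Entropy

prediction = torch.rand(4, 1, 64, 64)                 # probabilities in [0, 1)
target = torch.randint(0, 2, (4, 1, 64, 64)).float()  # binary building mask

loss = criterion(prediction, target)                  # mean loss over the batch
print(loss.item())
\end{verbatim}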
The optimiser controls the gradient descent; its hyperparameters control the size of the step taken down the negative gradient $- \nabla C$ of the cost function landscape. A suitable optimiser can prevent gradient descent from becoming trapped in a local minimum over the iterations. The optimiser of choice for this experiment is Adam (Adaptive Moment Estimation), developed by Kingma \& Ba (2017), which extends Stochastic Gradient Descent (herein SGD) by introducing the concepts of momentum and second moments of the gradient (\textit{see appendix \ref{app:Adam}}). Adam maintains a separate learning rate for each parameter and has since become a standard choice of optimiser.\\\par
\subsubsection{Backpropagation and the chain rule}\label{Backprop&Chain}
For a neural network to improve, the weights and biases are changed so as to minimise the cost. This is computed step-wise as a negative vector against the cost landscape, for which each parameter (the weights and biases of each neuron within the network) is related to the cost through a chained function in \textit{equation \ref{cost_derivative}}. In other words, it asks what the derivative of the cost function is with respect to each weight and bias via the chain of intermediate derivatives.\\\par
\begin{equation}
\label{cost_derivative}
\frac{\delta C}{\delta P^{i}} = \frac{\delta(w^{L}x^{i-1} + b^{i})}{\delta P^{i}} \frac{\sigma[\delta(w^{i}x^{i-1} + b^{i})]}{\delta (w^{i}x^{i-1} + b^{i})} \frac{\delta C}{\sigma[\delta(w^{i}x^{i-1} + b^{i})]}
\end{equation}
\begin{itemize}
\item Where:
\begin{itemize}
\item $\delta C =$ Derivative of Cost Function
\item $\delta P^{i} =$ Derivative of a Parameter, which could be the $w^{i}$ weight or $b^{i}$ bias or for activation function $\delta(w^{i}x^{i-1} + b^{i})$ at layer $i$
\item $x = $ Input Variable
\item $\sigma =$ Activation Function (e.g. ReLU, Sigmoid)
\end{itemize}
\end{itemize}
When \textit{equation \ref{cost_derivative}} is summed over all parameters of layer $i$, it becomes \textit{equation \ref{Sum_cost_derivative}}.\\\par
\begin{equation}
\label{Sum_cost_derivative}
\nabla C = \frac{\delta C}{\delta P^{i}} = \sum^{n_{i}-1} \frac{\delta(w^{L}x^{i-1} + b^{i})}{\delta P^{i}} \frac{\sigma[\delta(w^{i}x^{i-1} + b^{i})]}{\delta (w^{i}x^{i-1} + b^{i})} \frac{\delta C}{\sigma[\delta(w^{i}x^{i-1} + b^{i})]}
\end{equation}
Thus, taking the negative gradient $-\nabla C$ will provide the gradient descent step hopefully towards the global minimum.\\\par
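The chain-rule computation in equations \ref{cost_derivative} and \ref{Sum_cost_derivative} is performed automatically by the automatic differentiation of modern DL frameworks. The following minimal sketch, using arbitrary example tensors, shows one backpropagation and Adam descent step; it is an illustration of the mechanism rather than the training code of this study.
\begin{verbatim}
import torch

w = torch.randn(3, requires_grad=True)   # weights of a single perceptron
b = torch.zeros(1, requires_grad=True)   # bias
x = torch.tensor([0.2, 0.5, 0.1])        # arbitrary input
y = torch.tensor([1.0])                  # target label

optimiser = torch.optim.Adam([w, b], lr=1e-3)

prediction = torch.sigmoid(w @ x + b)    # forward pass: sigma(w.x + b)
cost = (prediction - y) ** 2             # a simple squared-error cost C

cost.backward()                          # chain rule: dC/dw and dC/db
optimiser.step()                         # descent step along -grad(C)
print(w.grad, b.grad)
\end{verbatim}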
Essentially, using CNN and DL methods to perform semantic segmentation through repeated iterations of the above processes has proven to generalise well to complex datasets. Although the aim of this study is not necessarily to produce a deployable model, it will lay the foundational groundwork for a data-driven evaluation of the different CNNs elaborated in section \ref{Arch&Hyperparam}.\\\par
\newpage
\chapter{Data and Methodologies}\label{DataandMethods}
\section{Study Areas of Interest}\label{AOI}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.3]{AOI.jpg}
\caption{The respective locations of the Kalobeyei and Dzaleka camps in East Africa.}
\label{fig:AOI}
\end{figure}
\subsection{Kalobeyei, Kakuma, Turkana, Kenya}\label{Kalobeyei}
The Kakuma camp was first established in 1992 in the rural north-western county of Turkana, Kenya. The camp was initially established as a temporary solution to accommodate refugees fleeing the Second Sudanese Civil War. However, as the conflict dragged on and was followed by subsequent conflicts in the nearby region, the Kakuma camp has been running for the past 30 years. As of 2020, Kakuma is home to 157,718 refugees, with increasing numbers coming from the more recent Somali and Ethiopian-Eritrean conflicts (IFC., 2018, UN-HABITAT, 2021).\\\par
The population of the Kakuma refugee camp has fluctuated in response to demand; however, a dramatic increase in population between 2013 and 2014 culminated in the development of the Kakuma 4 camp, the Kalobeyei Settlement, and the Kalobeyei Integrated Socio-Economic Development Plan (KISEDP). These settlements benefited from much better spatial planning in order to facilitate inclusive socio-economic development (UN-HABITAT, 2021, UNHCR \& DANIDA, 2019) (\textit{see figure \ref{fig:KU_KALO_LU}}). Both the Kakuma and Kalobeyei refugee camps have local integration as the targeted solution (UN-HABITAT, 2021, UNHCR \& DANIDA, 2019). A comprehensive study of the formal and informal economy of the Kakuma refugee camp conducted by the International Finance Corporation (IFC, 2018) estimates the market catering for the refugees and surrounding towns at KES 1.7 billion (USD \$16.4 million). The economic vibrancy of local integration has improved the economy of the impoverished Turkana county significantly. However, challenges remain in integrating into the wider Kenyan economy.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.1]{Kakuma_Kalo_LU.png}
\caption{The Kakuma-Kalobeyei land use and planning areas (UN-HABITAT, 2018)}
\label{fig:KU_KALO_LU}
\end{figure}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.4]{Kalobeyei_map.png}
\caption{RGB UAV imagery of the Kalobeyei settlements in rural Turkana from OpenAerialMap}
\label{fig:KBY_overview}
\end{figure}
\subsection{Dzaleka, Dowa, Malawi}\label{Dzaleka}
Originally an infamous prison camp under Banda's Malawi Congress Party regime, the area was converted into the Dzaleka Refugee Camp in 1994. Unlike the Kakuma and Kalobeyei camps, the Dzaleka Refugee Camp is located in the heart of Malawi, 45 km from the capital Lilongwe. It is home to around 52,000 refugees and receives on average 300 new residents every month, most coming from the Great Lakes area, in particular the Democratic Republic of the Congo and Burundi. A resurgence of past conflicts between the Republic of the Congo and the D.R. Congo has caused an increased influx in recent years (UNHCR, 2014, Kavalo E., 2016). Much of the infrastructure in the Dzaleka camp remains rudimentary at best, and very few resources and statistics were available via the UNHCR and UNDP portals. The northern extension to the Dzaleka main camp is known as the Katubza extension (\textit{referred to as Dzaleka North in the rest of this study}); it is a well-planned plot of land consisting of 423 shelters and was still under construction as of March 2021 (Gross G., 2021 \& UNHCR, 2021) (\textit{see figure \ref{fig:DZ_KA_PLAN}}). For the rest of this report, the datasets of Kalobeyei, Dzaleka, and Dzaleka North are denoted KBY, DZK, and DZKN respectively.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.4]{dzaleka_topo_malawi.jpg}
\caption{The main Dzaleka Refugee Camp and the Katubza extension plan (Dzaleka North) designed by Urban Design Advisor to the UNHCR Werner Schnellenberg (Gross G., 2021).}
\label{fig:DZ_KA_PLAN}
\end{figure}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.4]{Dzaleka_map.png}
\caption{Digitised rooftop of the Dzaleka and Dzaleka North camps by HOT volunteers}
\label{fig:Overview_DZK}
\end{figure}
Although hosting refugee camps is often seen as a burden on the surface, in reality many refugees are more educated than the local population, bringing entrepreneurial ability and providing the local area with an extra labour force (Alix-Garcia et al., 2018). With constant stakeholder pressure for relocation and closure, showcasing the refugee camps' local economic impact and potential can aid the formation of pro-poor policy (Cities Alliance, 2022).\\\par
\section{Data}\label{Data}
\subsection{Vector pre-processing}
Semantic segmentation tasks require input data of very high quality and quantity in order to perform successfully. The reference dataset must therefore be highly accurate, otherwise the model may misclassify. There were two significant issues: firstly, due to the centimetre-level resolution of the UAV raster data, the abundant building polygons from OpenStreetMap and \href{https://developers.google.com/earth-engine/datasets/catalog/GOOGLE_Research_open-buildings_v1_polygons}{Google Open Buildings V1 Polygon} did not align well spatially even after reprojection. Secondly, there was a temporal mismatch between these vector labels and the UAV imagery collection, resulting in labels without buildings and buildings without labels.\\\par
Fortunately, prior to this study, the HOT team and volunteers had begun collecting labels digitised specifically on the UAV imagery of the Dzaleka and Dzaleka North camps (\textit{see figure \ref{fig:Overview_DZK}}). Although spatially and temporally aligned, these vector labels were still not created with DL CV tasks in mind; hence, details around the edges of buildings and UAV motion artefacts (Smith et al., 2016, Caravick et al., 2016) (\textit{see figure \ref{fig:UAV_motion}}) may have been missed in labelling.\\\par
With the Kalobeyei camp unlabelled, this study created, albeit in smaller quantity, a carefully digitised, pixel-aligned dataset suitable for DL tasks. The combination of datasets provided gave this study a unique opportunity to investigate how the different datasets could influence segmentation results.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.3]{UAV_motion.png}
\caption{Motion artefacts unique to UAV imagery}
\label{fig:UAV_motion}
\end{figure}
\subsection{Raster pre-processing}
The UAV imagery was first downloaded from OpenAerialMap, resampled to 15 cm resolution using cubic-spline interpolation, and subsequently reprojected to EPSG:3857. The raster data were then normalised per colour band (RGB) to adequately re-scale the raster values for conversion to the PNG file format. This is perhaps one of the most important pre-processing steps, as normalised images enable easier training and prevent weight explosion (Harrison K., 2020). A two-step normalisation was performed on each band of each UAV image to ensure the distribution of values is preserved (Gonzalez \& Woods., 2002).\\\par
First, the z-score normalisation normalises the images according to the retrieved mean and standard deviation (\textit{see equation \ref{z-score}}). This scales every pixel to the global statistics for each colour band, keeping the proportional ratio while reducing the effect of outliers. The z-score normalised result is then linearly scaled to the range 0 to 255 for conversion to an 8-bit .png file (\textit{see equation \ref{png_norm}}; a minimal sketch of both steps follows the equations).
\begin{equation}
\label{z-score}
p_{z} = \frac{(p - \mu)}{\sigma}
\end{equation}
\begin{itemize}
\item Where:
\begin{itemize}
\item $p_{z} =$ z-score normalised pixel value
\item $p =$ Original pixel value
\item $\mu =$ Mean value of pixel
\item $\sigma =$ Standard Deviation of pixel
\end{itemize}
\end{itemize}
\begin{equation}
\label{png_norm}
p_{8 bit} = \frac{[p_{z} - min(p_{z})] * 255}{[max(p_{z}) - min(p_{z})]}
\end{equation}
\begin{itemize}
\item Where:
\begin{itemize}
\item $p_{8 bit} =$ Pixel output normalised between 0 and 255
\item $p_{z} =$ z-score normalised pixel value from \ref{z-score}
\end{itemize}
\end{itemize}
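A minimal numpy sketch of this two-step normalisation, applied independently to one colour band, is shown below; the random array stands in for a single UAV raster band and is not actual imagery from this study.
\begin{verbatim}
import numpy as np

band = np.random.rand(1024, 1024) * 4000      # stand-in for one raster band

# Step 1: z-score normalisation (equation "z-score")
z = (band - band.mean()) / band.std()

# Step 2: linear rescaling of the z-scores to the 8-bit range 0-255
#         (equation "png_norm") before writing the band to PNG
scaled = (z - z.min()) * 255.0 / (z.max() - z.min())
band_8bit = scaled.astype(np.uint8)
\end{verbatim}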
After per-band normalisation, the imagery bands were stacked with the associated labels. In order to increase the data quantity, cropping with $\frac{2}{3}$-overlapping steps was performed (a minimal sketch is given below). This resulted in image-label pair counts of Train n = 2606, Validation n = 1303, and Test n = 435, corresponding to a split ratio of 60, 30, and 10 \%. With augmentation applied, this increased the available data to Train n = 18242, Validation n = 3909, and Test n = 435 (\textit{see figure \ref{fig:InRGB} \& table \ref{table:data_count}}).\\\par
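The overlapping cropping can be sketched as follows. The tile size of 256 pixels and the use of a plain numpy array are illustrative assumptions; a step of one third of the tile size yields the $\frac{2}{3}$ overlap between adjacent tiles described above.
\begin{verbatim}
import numpy as np

def crop_with_overlap(image, tile=256):
    # Step of one third of the tile size => adjacent tiles overlap by 2/3
    step = tile // 3
    tiles = []
    for top in range(0, image.shape[0] - tile + 1, step):
        for left in range(0, image.shape[1] - tile + 1, step):
            tiles.append(image[top:top + tile, left:left + tile])
    return tiles

tiles = crop_with_overlap(np.zeros((1024, 1024, 3)))
print(len(tiles))   # number of image crops produced from one scene
\end{verbatim}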
\subsection{Data Augmentation}\label{DataAug}
Data augmentation is one of the most crucial steps in training a robust neural network and reducing generalisation error (Bengio et al., 2017, Stevens et al., 2020). It is an economical way of increasing generalisability without increasing model complexity; data augmentation achieves this firstly by increasing the quantity of training and validation data, and secondly by encompassing a greater range of textural, geometrical, and colour variability through the creation of augmented pseudo-data (Shorten \& Khoshgoftaar, 2019; Kinsley \& Kukiela, 2020; Howard \& Gugger, 2020; Zoph et al., 2019).\\\par
Data augmentation can generally be split into three categories: 1. geometric/affine distortion, 2. colour distortion, and 3. noise distortion. Which types of distortion are applied to the \textit{Train} and \textit{Validation} datasets is highly dependent on the context of the semantic task; therefore, care must be taken not to introduce mislabelling (\textit{see figure \ref{fig:MNIST5}}) (Ng A., 2018).\\\par
\textbf{Augmentation categories:}
\begin{itemize}
\item Geometric/Affine distortion
\begin{itemize}
\item e.g. Flipping, Stretching, Rotation...
\end{itemize}
\end{itemize}
\begin{itemize}
\item Colour distortion
\begin{itemize}
\item e.g. Colour Inversion, Solarise Colour, Greyscale...
\end{itemize}
\end{itemize}
\begin{itemize}
\item Noise distortion
\begin{itemize}
\item e.g. Blurring, Contrasting, Salt \& Pepper...
\end{itemize}
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.4]{MNIST5.png}
\caption{Geometric augmentation such as horizontal flipping should not be applied to the MNIST digit 5.}
\label{fig:MNIST5}
\end{figure}
Thus, the following augmentations were applied to the Train, Validation, and Test datasets respectively (a minimal sketch follows the list):
\begin{itemize}
\item Train - Inverse RGB, Horizontal Flip, Vertical Flip, Gaussian Blur, Contrast Increase, Solarise Colour
\item Validation - Horizontal Flip, Vertical Flip
\item Test - None
\end{itemize}
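A minimal torchvision-style sketch of the augmentations listed above is given below. The specific transform names and probabilities are illustrative assumptions, and in practice the geometric transforms must be applied identically to the image and its label mask; this is not necessarily the exact pipeline used in this study.
\begin{verbatim}
import torchvision.transforms as T

# Assumed train-time augmentations mirroring the list above
train_transforms = T.Compose([
    T.RandomHorizontalFlip(p=0.5),
    T.RandomVerticalFlip(p=0.5),
    T.GaussianBlur(kernel_size=5),           # blurring (noise distortion)
    T.RandomAutocontrast(p=0.5),             # contrast increase
    T.RandomSolarize(threshold=128, p=0.5),  # solarise colour
    T.RandomInvert(p=0.5),                   # inverse RGB
])

# Validation keeps only the geometric flips; the test set is left untouched
val_transforms = T.Compose([
    T.RandomHorizontalFlip(p=0.5),
    T.RandomVerticalFlip(p=0.5),
])
\end{verbatim}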
\begin{figure}[H]
\centering
\includegraphics[scale = 0.8]{InRGB.png}
\caption{An example of Inverse RGB augmentation applied to the Train dataset.}
\label{fig:InRGB}
\end{figure}
\begin{table}[H]
\centering
\resizebox{0.65\textwidth}{!}{\begin{minipage}{\textwidth}
\begin{tabular}{ |p{3cm}|p{3cm}|p{3cm}|p{3cm}| }
\hline
\multicolumn{4}{|c|}{Dataset input with augmentation} \\
\hline
Datasets & Train & Validation & Test \\
\hline
KBY & 5719 & 1224 & 272 \\
KBY + DZK + DZKN & 18242 & 3909 & 435 \\
\hline
\end{tabular}
\end{minipage}}
\caption{Resulting image and label pair counts for each dataset input configuration}
\label{table:data_count}
\end{table}
\subsubsection{Pre-trained weights and transfer-learning}\label{pretrained_weights}
Pre-trained networks are neural networks that have already been trained and optimised on a particular dataset. Their adjustable parameters have therefore already learnt basic features such as edges and shades. Training built upon a pre-trained network could therefore potentially reduce variation in the training results (Bengio et al., 2017); this process is known as transfer learning. By comparing architectures initialised randomly and on pre-trained weights, this study investigates the effect of weight initialisation and transfer training across different datasets.\\\par
Fortunately for recent DL practitioners, many of the classical and sometimes novel architectures have already been trained on large-scale CV datasets (e.g. ImageNet, CIFAR) and have been made available to the community. This has made such models, which often require large computational resources to train, available to resource-constrained projects.\\\par
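A minimal sketch of weight initialisation using the Segmentation-Models-PyTorch API described in section \ref{EffNet} is shown below, contrasting a randomly initialised encoder with one starting from ImageNet pre-trained weights; the encoder and decoder choices mirror the architectures listed in table \ref{table:setup}.
\begin{verbatim}
import segmentation_models_pytorch as smp

# EfficientNet-B1 encoder with a U-Net decoder, trained from scratch
unet_scratch = smp.Unet(
    encoder_name="efficientnet-b1",
    encoder_weights=None,        # random initialisation
    in_channels=3,               # RGB input
    classes=1,                   # single building / non-building mask
)

# Same architecture, but the encoder starts from ImageNet pre-trained weights
unet_pretrained = smp.Unet(
    encoder_name="efficientnet-b1",
    encoder_weights="imagenet",
    in_channels=3,
    classes=1,
)
\end{verbatim}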
\section{Research Questions and experiment design}\label{RQ}
To provide HOT with objective, clear results that give evidence-based direction for future research, clear guiding research questions and careful experiment designs were required.
In order to train a model which performs well on UAV imagery, the motion artefact was a significant feature for the models to learn. The combination of available data has allowed a unique set of research questions concerning input data quality and experiment setup to surface.\\\par
Additionally, using pre-trained weights is a strategy well documented in the literature to improve performance across many domains (Stevens et al., 2020, Howard \& Gugger, 2020). Numerous studies have showcased the success of cross-domain transfer training from classical CV datasets to remote sensing tasks (e.g. Audebert et al., 2017, Marmanis et al., 2016); therefore, one would expect a CNN pre-trained on any dataset to have an advantage. Thus, it is important that this study also tests the CNNs' responses when initialised with weights from ImageNet and from the OCC building segmentation model.\\\par
\begin{enumerate}
\item RQ1: Do state-of-the-art models allow for accurate detection of buildings from UAV data in refugee camps?
\item RQ2: What is the optimal mixture of accurate and less-accurate labels and how does that affect the segmentation output result?
\begin{enumerate}
\item How does the introduction of complex environment such as heterogeneous urban morphologies, roofing materials, and UAV drone artefacts affect result?
\end{enumerate}
\item RQ3: How do existing models pre-trained on classical CV datasets and/or building datasets respond when applied to the setting of refugee camps?
\end{enumerate}
The selection of models will initially be trained on the pixel-perfect and less complex Kalobeyei dataset; this will then be followed by introducing the Dzaleka datasets of higher complexity. \textit{Figure \ref{fig:rooftops}} shows a snapshot of the diverse rooftops to be segmented in the available datasets. A comparison of performance between the U-Net variations (Ronneberger et al., 2015) and the \href{https://github.com/drivendataorg/open-cities-ai-challenge/tree/master/1st\%20Place}{Open-Cities-AI-Challenge (herein OCC) winning model} is conducted (\textit{see table \ref{table:setup}}).\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.3]{batch_32.png}
\caption{Collections of diverse and heterogeneous rooftops from the Kalobeyei, Dzaleka, and Dzaleka North datasets.}
\label{fig:rooftops}
\end{figure}
\section{Architecture and hyperparameter selection}\label{Arch&Hyperparam}
The selection of model architectures and their associated hyperparameters is highly dependent on the computational resources and the task at hand (Ng A., 2018, Howard \& Gugger, 2020). As this study aims to output a pixel-based binary segmentation which delineates building and non-building, and given the computational resource constraints, model selection was based on tried and tested architectures with relatively low numbers of trainable parameters.\\\par
\subsection{The U-Net and U-Net variants}\label{Unet}
The U-Net architecture was first developed by Ronneberger et al. (2015) for the task of cell segmentation in biomedical electron microscope images. The architecture features a symmetrical encoder-decoder structure (\textit{see figure \ref{fig:U-Net}}) and, as with many other CNNs, has transferred successfully into the remote sensing domain (Höser \& Künzer, 2020, Höser et al., 2020, Xu et al., 2019). This symmetrical encoder-decoder architecture with concatenated skip connections is able to extract deeper features in the encoder layers, then recover and interpolate spatial features in the connected upsampling decoder layers (Wurm et al., 2019).\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.75]{Seale_U-Net.jpg}
\caption{The Encoder-Decoder U-Net architecture (Ronneberger et al., 2015, Seale et al., 2022)}
\label{fig:U-Net}
\end{figure}
\subsubsection{Changing the encoder architecture and the EfficientNet family}\label{EffNet}
The ability to switch out the encoder structure allows the DL practitioner to experiment with more up-to-date architectures without changing the output shape. This drastically increases the number of experiment combinations, allowing the best combination of encoder-decoder structure suitable for the dataset to be tested. All experiments in this study were carried out using the high-level PyTorch API \href{https://segmentation-models-pytorch.readthedocs.io/en/latest/quickstart.html}{Segmentation-Models-PyTorch} created by Yakubovskiy P. (2021), who was also the winner of the OCC challenge for UAV building segmentation. This study compares and contrasts the unchanged 4-layer and 5-layer U-Net architectures with U-Nets with changed encoders. The changed encoders were based on the EfficientNet family, for three reasons. Firstly, in one of the last stages of the OCC competition-winning network, EfficientNet B1 was used as an encoder. Secondly, the EfficientNet family is a set of network architectures that are structured and easy to scale up when computational resources become available. Thirdly, they are perhaps the best representation of generalised state-of-the-art architectures that have been tested and performed well on classical CV datasets (\textit{see figure \ref{fig:Eff_perform}}) (Tan \& Le, 2020). In essence, these are sets of experiments that mix and match old and new architectural designs.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.5]{Eff_perform.png}
\caption{EfficientNet family Top 1\% Accuracy Assessment in ImageNet (Tan \& Le, 2020).}
\label{fig:Eff_perform}
\end{figure}
The EfficientNet family uses compound scaling, which jointly increases the network width, depth, and input resolution. The baseline architecture was generated using AutoML Neural Architecture Search (Elsken et al., 2019), which optimised for computational efficiency and accuracy (\textit{see appendix \ref{app:EfficientNet}}). The unweighted four-layer U-Net, the five-layer U-Net, and the OCC-winner-weighted EfficientNet B1 U-Net are therefore the key architectures against which the rest will be compared.\\\par
\begin{table}[H]
\centering
\resizebox{0.5\textwidth}{!}{\begin{minipage}{\textwidth}
\begin{tabular}{ |p{3cm}|p{3cm}|p{3cm}|p{3cm}|p{3cm}| }
\hline
\multicolumn{5}{|c|}{Trained Architecture Specification Table} \\
\hline
Encoder & Decoder & Initialised weights & Trainable parameters & Batch-size (8 GB GeForce GTX 1070Ti) \\
\hline
4-layer U-Net Encoder & 4-layer U-Net Decoder & None & 7,763,041 & 32\\
5-layer U-Net Encoder & 5-layer U-Net Decoder & None & 31,100,513 & 32\\
EfficientNet-B1 & 4-layer U-Net Decoder & None & 7,005,041 & 32\\
EfficientNet-B1 & 4-layer U-Net Decoder & ImageNet & 7,005,041 & 32\\
EfficientNet-B1 & 5-layer U-Net Decoder & OCC & 8,757,105 & 16\\
EfficientNet-B2 & 4-layer U-Net Decoder & None & 8,211,283 & 32\\
EfficientNet-B2 & 4-layer U-Net Decoder & ImageNet & 8,211,283 & 32\\
\hline
\end{tabular}
\end{minipage}}
\caption{The U-Nets and the variations thereof selected for this study}
\label{table:setup}
\end{table}
\section{Hyperparameters and baseline model performance}
The hyperparameters of a neural network are the adjustable parameters which control the training process (Bengio et al., 2017, Stevens et al., 2020, Howard \& Gugger, 2020). They include the batch size, optimiser, learning rate, weight decay, loss function, and learning rate scheduler, among others (\textit{see table \ref{table:hyperparameters}}; a minimal sketch of this fixed configuration follows the table). One of the most difficult processes in DL is finding hyperparameter values that cause the model to neither overfit nor underfit the dataset. The strategies and options are often overwhelming; therefore, this study does not concern itself with changing or tuning the hyperparameters for all the models, but rather holds them constant so that a comprehensive controlled experiment can be performed. This allows the \textbf{baseline} performance of each architecture setup (\textit{see table \ref{table:setup}}) to be identified. This will provide a clear picture of each setup's feasibility, uncover challenges and potentials, and give insight into where further resources could justifiably be used to scale future experiments.\\\par
\begin{landscape}
\begin{table}[H]
\hspace*{-5cm}
\centering
\resizebox{0.7\textwidth}{!}{\begin{minipage}{\textwidth}
\begin{tabular}{ |p{5cm}|p{5cm}|p{10cm}| }
\hline
\multicolumn{3}{|c|}{Unchanged hyperparameters for the study experiments} \\
\hline
Hyperparameter & Value to be held constant & Description\\
\hline
Batch size & 32 (16) & Number of image and label pairs shown to the CNN per iteration until the dataset is exhausted; standardised to a batch size of 32 except for the \textit{EfficientNet B1 U-Net OCC} architecture with a 5-layer U-Net decoder, which used 16 \\
\hline
Optimiser & Adam \textit{see appendix \ref{app:Adam}} & Adaptive Momentum Estimator developed by Kingma \& Ba. (2017) \\
\hline
Loss function & Binary Cross Entropy & Measures the dissimilarity between the predicted per-pixel building probability and the binary reference label \textit{see equation \ref{BCELoss}}\\
\hline
Learning rate & 1e-3 & Size of step to be taken down the negative gradient of the cost function landscape\\
\hline
Weight decay & 1e-5 & Also known as L2 regularisation; the sum of all squared weights is added to the loss function, which keeps the weights from growing too large and helps to limit overfitting\\
\hline
Training epochs & 500 & Number of complete cycles through either the training or validation dataset at designated batch size\\
\hline
Validation rate & 10 epochs & Validation data are loaded every 10 epochs to monitor performance\\
\hline
Learning rate scheduler & Reduce Learning Rate on Plateau [factor (0.1), minimum (1e-8), epoch (20)] & The learning rate is reduced by a factor of 0.1 when learning stagnates and stops improving for 20 epochs, down to a minimum of 1e-8\\
\hline
\end{tabular}
\end{minipage}}
\caption{The hyperparameters and respective values to be held constant for every experiment in this study.}
\label{table:hyperparameters}
\end{table}
\end{landscape}
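As a minimal sketch (assuming the PyTorch and Segmentation-Models-PyTorch APIs introduced earlier; variable names are illustrative only, not the verbatim training script), the fixed hyperparameters in table \ref{table:hyperparameters} could be instantiated as follows:
\begin{verbatim}
# Minimal, illustrative sketch of the constant hyperparameter setup.
import torch
import segmentation_models_pytorch as smp

model = smp.Unet(encoder_name="efficientnet-b1", encoder_weights=None,
                 in_channels=3, classes=1, activation="sigmoid")

criterion = torch.nn.BCELoss()                     # Binary Cross Entropy
optimizer = torch.optim.Adam(model.parameters(),
                             lr=1e-3,              # learning rate
                             weight_decay=1e-5)    # L2 regularisation
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=20, min_lr=1e-8)
\end{verbatim}
Only the encoder, the initialised weights, and the input dataset vary between the experiments; everything shown here is held constant.\\\par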
\section{Accuracy Assessment}\label{AccAss}
Detailed and scrutable accuracy assessments are fundamental to any ML-based analysis. This section introduces and breaks down the various lower-order and higher-order class-based (thematic) accuracy assessments. Explaining the characteristics of each metric allows a much more granular accuracy assessment of the findings in section \ref{Findings}. In general, accuracy assessment in remote sensing can be divided into two categories: 1. Positional Accuracy and 2. Thematic Accuracy. Thematic Accuracy deals with the accuracy of labels or attributes (Congalton \& Green, 2019; Bolstad, 2019) and will be the focus of assessment. Here the study differentiates between two groups of class-based accuracy assessments. The binary classification metrics are more granular and focus on assessing relevant or irrelevant classifications. Meanwhile, the statistical metrics are more aggregated and generalised; they are often a statistical combination of the binary classification metrics.\\\par
The metrics described in this section form part of the larger family of accuracy assessment metrics that can be constructed from the confusion matrix (\textit{see figure \ref{fig:cmatrix}}).\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.6]{Confusion_matrix.png}
\caption{The Confusion Matrix}
\label{fig:cmatrix}
\end{figure}
\subsection{Binary classification metrics}\label{1storder}
The building blocks of any binary classification and statistical analysis metrics described in the following section are based upon the counting of segmented pixels. The segmentation output will cover the whole imagery with each pixel assigned to be \textit{True Positive}, \textit{False Positive}, \textit{True Negative}, or \textit{False Negative}. A schematic theoretical example is described below:\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.75]{grids.png}
\caption{Examples of theoretical binary building classification.}
\label{fig:grids}
\end{figure}
\newpage
\textbf{Precision} and \textbf{Recall} are also known as Positive-Predictive-Value and Sensitivity/True-Positive-Rate respectively. The two metrics are often used together; other common denominations, especially in the remote sensing literature, are User's Accuracy and Producer's Accuracy (Congalton \& Green, 2019; Wegmann et al., 2016). To avoid further confusion in nomenclature, \textbf{Precision} and \textbf{Recall} will be used from hereon.\\\par
\textbf{Precision} is the measure of the correctly predicted Positive class $True\ Positive$ against all predictions assigned to that class $True\ Positive + False\ Positive$, i.e. of the pixels predicted as positive, what proportion were truly positive. It can be expressed mathematically as:
\begin{equation}
Precision = \frac{True\ Positive} {(True\ Positive + False\ Positive)}
\end{equation}
Meanwhile, \textbf{Recall} measures the correctly predicted Positive class $True\ Positive$ against both the correct and incorrect predictions on the Positive reference class $True\ Positive + False\ Negative$, i.e. of the pixels referenced as positive, what proportion were correctly predicted. It can be expressed mathematically as:
\begin{equation}
Recall = \frac{True\ Positive} {(True\ Positive + False\ Negative)}
\end{equation}
\textbf{Specificity}, aka. True-Negative-Rate, measures the correctly predicted Negative class $True\ Negative$ against the correct and incorrect predictions on the Negative reference class $False\ Positive + True\ Negative$, i.e. of the pixels referenced as negative, what proportion were correctly predicted. It can be expressed mathematically as:
\begin{equation}
Specificity = \frac{True\ Negative} {(False\ Positive + True\ Negative)}
\end{equation}
Therefore, a higher \textbf{Recall} suggests the model is better at identifying positives and, vice-versa, a higher \textbf{Specificity} suggests the model is better at identifying negatives. Since this exercise aims to maximise positive predictions as a binary building segmentation classifier, emphasis will be placed on maximising \textbf{Precision} and \textbf{Recall}.\\\par
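As a brief illustration (a minimal NumPy sketch under the pixel-count definitions above, not the evaluation code used in this study), the three metrics can be computed directly from the confusion-matrix counts of a predicted mask against a reference mask:
\begin{verbatim}
# Minimal sketch: pixel-wise binary classification metrics from a
# predicted mask and a reference mask (boolean arrays, building = True).
import numpy as np

def binary_metrics(pred, ref):
    tp = np.sum(pred & ref)        # True Positive
    fp = np.sum(pred & ~ref)       # False Positive
    fn = np.sum(~pred & ref)       # False Negative
    tn = np.sum(~pred & ~ref)      # True Negative
    precision   = tp / (tp + fp)
    recall      = tp / (tp + fn)
    specificity = tn / (fp + tn)
    return precision, recall, specificity
\end{verbatim}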
\subsection{Statistical analysis metrics}\label{2ndorder}
The following are statistical accuracy assessment metrics, which often encompass the binary classification metrics of the section above. Thus, although the statistical analysis metrics provide a more generalised overview, they often omit the detailed responses only available from the simpler metrics. Hence, such metrics are often employed as a means to evaluate and rank DL-based CV challenges and competitions (e.g. Kaggle challenges), but they are less effective in the detailed assessment of segmentation results.\\\par
\begin{equation}
Overall\ Accuracy = \frac{TP + TN}{TP + TN + FP + FN}
\end{equation}
The \textbf{Overall Accuracy} (herein OA) gives an easy-to-implement, general but aggregated answer of how well the classification is doing, which omits the details. The metric suffers when imbalanced counts of multiple classes are involved.\\\par
\begin{equation}
Dice\ Score = 2 * \frac{Precision * Recall}{(Precision + Recall)}
\end{equation}
The \textbf{Dice Score}, aka. the F1 score, calculates the harmonic mean of \textbf{Precision} and \textbf{Recall} with the contributions of both balanced in weighting; the Dice Score can therefore be skewed by classification results with markedly higher performance in either Precision or Recall. Additionally, the metric does not take $True\ Negative$ values into account, should such statistics be of interest.\\\par
\subsubsection{Intersection-over-Union}\label{IoU}
\begin{equation}
IoU = \frac{A \cap B}{A \cup B} = \frac{True\ Positive}{True\ Positive + False\ Positive + False\ Negative}
\end{equation}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.75]{IoU.png}
\caption{Schematic diagram of Intersection-over-Union}
\label{fig:IoU}
\end{figure}
The Intersection-over-Union (IoU), aka. Jaccard Index, is one of the most commonly used assessment metrics in Deep Learning Computer Vision competitions. It is a geometric accuracy assessment: the metric calculates geometrically the area in common between the predicted and reference labels, quantifying the similarity between the two sets. It is mathematically very similar to, and positively correlated with, the \textbf{Dice Score} but places more emphasis on false classifications. The IoU is easy to conceptualise and, due to its established prevalence, to compare against other ML results, making it a good metric for comparison against the OCC competition results.\\\par
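It is worth noting that, under the pixel-count definitions above, the IoU and the \textbf{Dice Score} are monotonically related and can be converted into one another:
\begin{equation}
IoU = \frac{Dice\ Score}{2 - Dice\ Score}, \qquad Dice\ Score = \frac{2 \cdot IoU}{1 + IoU}
\end{equation}
so a ranking of models by one metric is preserved by the other, although the IoU always reports the lower value and thus penalises false classifications more heavily.\\\par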
\subsection{Project workflow}\label{ProjWorkflow}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.5]{ETL_flowFINAL.png}
\caption{Project workflow}
\label{fig:ETL_flow}
\end{figure}
The workflow for this study consists of five main stages: 1. Download and Extraction, 2. Data pre-processing, 3. Data processing for loading, 4. Iterative model training, 5. Inference and Evaluation (\textit{see figures \ref{fig:ETL_flow} \& \ref{fig:simp_ETL}}); a minimal data-loading sketch is given after the figures.
\begin{figure}[H]
\centering
\includegraphics[scale = 0.25]{simp_ETL.png}
\caption{Simplified five-step project workflow, with reference to figure \ref{fig:ETL_flow}.}
\label{fig:simp_ETL}
\end{figure}
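To make stages 3 and 4 more concrete, the following is a minimal sketch (assuming a PyTorch-style dataset; the class name, file layout, and pre-processing are hypothetical examples only) of how image and label pairs could be loaded and iterated in batches:
\begin{verbatim}
# Minimal, illustrative sketch of the data-loading stage; the class
# name, file layout, and transforms are hypothetical examples.
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class TileDataset(Dataset):
    def __init__(self, image_paths, mask_paths):
        self.image_paths = image_paths
        self.mask_paths = mask_paths

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = np.load(self.image_paths[idx])   # H x W x 3 image tile
        mask = np.load(self.mask_paths[idx])     # H x W binary label
        image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
        mask = torch.from_numpy(mask).unsqueeze(0).float()
        return image, mask

# loader = DataLoader(TileDataset(images, masks), batch_size=32, shuffle=True)
\end{verbatim}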
\newpage
\chapter{Findings and Discussion}\label{find&discuss}
\section{Findings}\label{Findings}
A total of 5 different architectures were trained, with either no initialised weights or weights pre-trained on ImageNet or on the OCC building segmentation models. For each experiment setup, there were two input dataset configurations (KBY and KBY + DZK + DZKN); for details \textit{see table \ref{table:setup}}. This produced 16 trained CNNs and associated class-based accuracy assessments (\textit{see figure \ref{fig:Cat_CAA}}); the experimental results from which the plot and the following analysis are derived are given in Appendix \ref{app:mean_data}. On the whole, there were both expected and unexpected results. A reduction in every metric was observed in every single experiment setup when the more complex Dzaleka camps datasets were introduced (\textit{see table \ref{table:data-wise_change}}). This was expected, as it is more difficult to train the CNNs on the highly heterogeneous rooftops with texture similar to the surrounding environment. The \textit{Precision} and \textit{Recall} metrics did not vary much between the architectures when trained only on the Kalobeyei dataset, with the exception of the transferred-untrained \textit{EfficientNet B1 U-Net OCC} model. In contrast, the differences in performance are far more visible with the introduction of the Dzaleka and Dzaleka North datasets.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.35]{256KBYEB1-UNet-OCCUNTRAINEDBASE.png}
\includegraphics[scale = 0.35]{256KBYFive-UnetBASE.png}
\includegraphics[scale = 0.35]{256ALLEB1-UNet-OCCBASE.png}
\includegraphics[scale = 0.35]{256ALLEB2-UNet-NoIMN.png}
\caption{Sample of binary segmentation output of various combinations of tested architecture and experiment setup.}
\label{fig:output}
\end{figure}
\newpage
\begin{landscape}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.4]{Cat_CAA.png}
\caption{Class-based Accuracy Assessment metrics for respective CNN architectures and experiment input datasets.}
\label{fig:Cat_CAA}
\end{figure}
\end{landscape}
\newpage
When comparing the results trained on the Kalobeyei dataset (KBY), the architectures with novel EfficientNet encoders (\textit{EfficientNet B1 \& B2}) achieve better performance than the unchanged architectures (\textit{4-layer \& 5-layer U-Net}) on the metrics of \textit{Precision}, \textit{Dice Score}, and \textit{IoU}. With the exception of the \textit{EfficientNet B1 U-Net} initialised on the OCC weights, the \textit{EfficientNet B1} encoder performs better in these metrics than the \textit{EfficientNet B2} encoder regardless of initialised weights, while differences between the other metrics were negligible. Regarding \textit{RQ1 \& 2(a)}, with the introduction of the Dzaleka and Dzaleka North (DZK + DZKN) datasets, all EfficientNet encoders suffer from larger performance losses across all accuracy assessment metrics when compared against the unchanged U-Nets. A conclusion for \textit{RQ1} could be drawn that the novel U-Net architectures with EfficientNet B1 and B2 encoders can perform better, but only with accurately labelled data and homogeneous roofs and urban morphology, while the unchanged U-Nets may be more robust when dealing with the more complicated, older refugee camps. Further details will be addressed in the discussion sections \ref{depth_change} \& \ref{data_change}.\\\par
An interesting observation is that, for the \textit{EfficientNet B1 U-Net} and the symmetrical \textit{4-layer \& 5-layer U-Nets}, both \textit{Precision} and \textit{Recall} increased when initialised on pre-trained ImageNet weights (\textit{see table \ref{table:weight-wise_change}}), with the exception of the \textit{EfficientNet B2 U-Net}, where a decrease in \textit{Precision} but an increase in \textit{Recall} was observed when compared to the network with non-initialised weights. This suggests that an increase in depth does not necessarily correspond to an increase or decrease in performance in any particular direction. For the non-weight-initialised \textit{4-layer to 5-layer U-Net}, there are consistent albeit slight improvements in \textit{Precision} but not \textit{Recall}, in line with the depth-wise changes discussed in section \ref{depth_change}, where an increase in depth mostly shows improvement in \textit{Precision} but not \textit{Recall} (\textit{see table \ref{table:depth-wise_change}}).\\\par
\section{Discussion}\label{Discussion}
\section{Depth-wise Precision and Recall change}\label{depth_change}
\begin{table}[H]
\centering
\resizebox{0.55\textwidth}{!}{\begin{minipage}{\textwidth}
\begin{tabular}{ |p{3cm}|p{3cm}|p{3cm}|p{3cm}|p{3cm}| }
\hline
\multicolumn{5}{|c|}{Depth-wise Precision and Recall change} \\
\hline
Architecture & Initialised weights & Input dataset & Precision change & Recall change \\
\hline
4 to 5 layer U-Net & None & KBY & +0.011 & -0.003\\
4 to 5 layer U-Net & None & KBY + DZK + DZKN & +0.007 & -0.002\\
EfficientNet B1 to B2 U-Net & None & KBY & -0.003 & -0.012\\
EfficientNet B1 to B2 U-Net & ImageNet & KBY & +0.003 & -0.006\\
\textbf{EfficientNet B1 to B2 U-Net} & \textbf{None} & \textbf{KBY + DZK + DZKN} & \textbf{+0.023} & \textbf{+0.006}\\
EfficientNet B1 to B2 U-Net & ImageNet & KBY + DZK + DZKN & -0.006 & -0.002\\
\hline
\end{tabular}
\end{minipage}}
\caption{Changes for architectures with a depth-wise increase, for each setup.}
\label{table:depth-wise_change}
\end{table}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.6]{depth_wise_regplot.png}
\caption{Regression plot for \textit{Precision} and \textit{Recall} change in relation to architectural depth-wise change.}
\label{fig:depth_regplot}
\end{figure}
There is a common misconception in the DL realm that deeper networks will always perform better. The comparison in table \ref{table:depth-wise_change} addresses this question within a limited scope. The results suggest that improvements in both \textit{Precision} and \textit{Recall} only occurred in the non-weight-initialised depth increase from \textit{EfficientNet B1 to B2 U-Net} trained on all datasets, which achieved the highest rate of increase in both metrics. In comparison, the same architecture change and dataset input with weights initialised from ImageNet experienced a decrease in both metrics. Meanwhile, in the other experiment setups, no significant trends can be drawn and thus the assumption does not hold, although results might differ drastically with more data, larger batch sizes, and much deeper architectures, which were unfortunately not available to this study due to computational constraints.\\\par
\section{Dataset-wise Precision and Recall change}\label{data_change}
\begin{table}[H]
\centering
\resizebox{0.65\textwidth}{!}{\begin{minipage}{\textwidth}
\begin{tabular}{ |p{3cm}|p{3cm}|p{3cm}|p{3cm}| }
\hline
\multicolumn{4}{|c|}{Dataset-wise (KBY to KBY + DZK + DZKN) Precision and Recall change} \\
\hline
Architecture & Initialised weights & Precision change & Recall change \\
\hline
4-layer U-Net & None & -0.065 & -0.016\\
\textbf{5 layer U-Net} & \textbf{None} & \textbf{-0.07} & \textbf{-0.0153} \\
EfficientNet B1 U-Net & None & -0.124 & -0.044\\
EfficientNet B1 U-Net & ImageNet & -0.119 & -0.032\\
EfficientNet B1 U-Net (OCC) & OCC & \textbf{-0.002} & -0.387 \\
EfficientNet B1 U-Net (OCC) & OCC transfer-trained & -0.125 & -0.043 \\
EfficientNet B2 U-Net & None & -0.099 & -0.026\\
EfficientNet B2 U-Net & ImageNet & -0.128 & -0.028\\
\hline
\end{tabular}
\end{minipage}}
\caption{Changes when the Dzaleka and Dzaleka North datasets were introduced to each setup.}
\label{table:data-wise_change}
\end{table}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.45]{dataset_wise_stripplot.png}
\caption{Detailed strip plot for \textit{Precision} and \textit{Recall} change in relation to dataset input change.}
\label{fig:data_stripplot}
\end{figure}
With the Kalobeyei dataset held constant, the introduction of the Dzaleka and Dzaleka North datasets resulted in a reduction in \textit{Precision} and \textit{Recall} for all architectures, with or without initialised weights. Table \ref{table:data-wise_change} might suggest that the smallest reduction in \textit{Precision} came from the \textit{EfficientNet B1 U-Net} initialised on OCC building segmentation weights. However, figures \ref{fig:output} and \ref{fig:Cat_CAA} show that, with such poor \textit{Dice Score} and \textit{IoU}, the OCC competition-winning network is either very conservative in its segmentation or completely missing the other buildings. Thus, \textit{Precision} diverges from the \textit{Recall} results, and the statistics (\textit{see figure \ref{fig:data_stripplot}}) suggest predictions with a high $False\ Negative$ count, which therefore do not reflect overall performance. The \textit{5-layer U-Net} had much more consistent results between \textit{Precision} and \textit{Recall}, showing these metrics were least affected by the introduction of the Dzaleka camps datasets. However, it does not seem to be the case that deeper or shallower versions of the architectures cause more or less reduction; therefore it is difficult to draw any conclusion regarding \textit{RQ2(a)}.\\\par
\section{Initialised weight Precision and Recall change}\label{weight_change}
\begin{table}[H]
\centering
\resizebox{0.55\textwidth}{!}{\begin{minipage}{\textwidth}
\begin{tabular}{ |p{3cm}|p{3cm}|p{3cm}|p{3cm}|p{3cm}| }
\hline
\multicolumn{5}{|c|}{Pre-initialised weights Precision and Recall change} \\
\hline
Architecture & Weights changed & Dataset input & Precision change & Recall change \\
\hline
EfficientNet B1 U-Net & None to ImageNet & KBY & +0.003 & +0.008\\
\textbf{EfficientNet B1 U-Net} & \textbf{None to ImageNet} & \textbf{KBY + DZK + DZKN} & \textbf{+0.009} & \textbf{+0.0197}\\
EfficientNet B1 U-Net (5-layer) & OCC to OCC transfer-trained & KBY & 0 & +0.306\\
EfficientNet B1 U-Net (5-layer) & OCC to OCC transfer-trained & KBY + DZK + DZKN & -0.122 & \textbf{+0.649}\\
EfficientNet B2 U-Net & None to ImageNet & KBY & +0.009 & +0.014\\
EfficientNet B2 U-Net & None to ImageNet & KBY + DZK + DZKN & -0.02 & +0.012\\
\hline
\end{tabular}
\end{minipage}}
\caption{Initialised weight changes in the available CNNs and their effects on the metrics.}
\label{table:weight-wise_change}
\end{table}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.5]{weight_wise_regplot.png}
\caption{Regression plot for \textit{Precision} and \textit{Recall} change in relation to the architectures' initialised weight change.}
\label{fig:weight_regplot}
\end{figure}
In section \ref{RQ}, we hypothesised that CNNs trained on pre-initialised weights might have significant advantages in performance. One might therefore expect that, although the OCC model suffers from low \textit{Recall} and a high $False\ Negative$ count in its segmentation output, further training of the weighted network would result in drastic improvement. Table \ref{table:weight-wise_change} indicates that this was indeed the result: \textit{EfficientNet B1 U-Net OCC to OCC transfer-trained} achieved the highest \textit{Recall} change, but it also caused the highest decrease in \textit{Precision}. This trend is less significant in the version trained only on the Kalobeyei dataset, but the result reflects the assumption. This suggests that while transfer-training from the OCC model compensated for the $False\ Negative$ issue, the improvement in $True\ Positive$ segmentation is not as significant. Meanwhile, the \textit{EfficientNet B1 U-Net} initialised from ImageNet weights saw improvements in both metrics, but the \textit{EfficientNet B2 U-Net} did not. The results are therefore inconclusive with respect to \textit{RQ3}.\\\par
A few generalisations can be drawn from the above results. Firstly, increases in architectural depth tend to favour improvements in \textit{Precision}, meaning that deeper networks tend to reduce $False\ Positive$ classifications. Meanwhile, ImageNet-initialised weights in most settings tend to favour improvements in \textit{Recall}, which indicates a reduction in $False\ Negative$. These are useful generalisations for future experiments and especially for tuning any of the architectures. In general, the introduction of the complex Dzaleka camps datasets caused all architectures to perform worse, but no significant conclusion can be drawn as to whether deeper versions of the architectures or initialised weights, in tandem with the particular dataset introduced, would reduce the rate of decrease in \textit{Precision} or \textit{Recall}. Finally, a point of contention remains in the human error in labelling. Particularly for the Dzaleka dataset, there are many instances that could contribute to a $False\ Positive$ in the prediction output which in reality was a $True\ Positive$ (\textit{see figure \ref{fig:ambiguity}}). These ambiguities often arise at the courtyard of a particular building or where multiple buildings are interconnected, which is especially common in the complex Dzaleka camp. This might suggest that networks trained on all datasets (KBY + DZK + DZKN) might be achieving higher $Precision$ than actually displayed.\\\par
\begin{figure}[H]
\centering
\includegraphics[scale = 0.45]{FP_clothingline.png}
\caption{Ambiguity arising from labelling could cause a $True\ Positive$ prediction to be classified as $False\ Positive$.}
\label{fig:ambiguity}
\end{figure}
A few shortcomings of this study should also be highlighted. While the batch size should ideally have been standardised at 32 image and label pairs, the computational constraint of an 8 GB GPU dictated that only 16 pairs per batch could be fitted for the \textit{EfficientNet B1 U-Net (OCC)} model. Standardising the batch size across all models would have been ideal; however, the prolonged training time required at a batch size of 16 pairs is around three complete days, and thus a compromise was made to keep 32 pairs for the remaining models. Another shortcoming was that none of the classical 4-layer and 5-layer U-Nets were tested with pre-trained ImageNet weights; additional studies would be useful to compare and contrast such changes against the EfficientNet-encoder U-Nets pre-trained on ImageNet weights.\\\par
\newpage
\chapter{Conclusion}\label{Conclude}
The beginning of a DL project is a momentous task. Errors from data pre-processing to architecture selection can be costly in both time and resources, especially in the setting of humanitarian NGOs (Private Communication, 2022). It is therefore important for a pilot project to uncover the possibilities and challenges with well-designed, small-scale yet rigorous experiments. This study presented a series of experiments which tested variations of the U-Nets on the shallow end of the spectrum; it also explored the possibility of transfer-training from a previous competition-winning neural network and how it compared against networks trained from scratch and/or pre-trained on a large-scale CV dataset. Initially, there was an assumption that transfer-training from pre-trained networks and deeper architectures would perform better. However, the results show that the overall picture is a lot more complicated, where the models may experience drastic improvement in some metrics but not others. Furthermore, focusing on the $True\ Positive$-centric class-based accuracy assessment metrics has shown that performance in \textit{Precision \& Recall} does not necessarily corroborate, and that there is a nuanced relationship between their improvement, architectural depth, and pre-trained weights.\\\par
Although many ambiguities and questions remain, several key takeaways have surfaced. First, transferring a competition-winning network which scored very well on the \textit{IoU} metric does not necessarily guarantee easy transfer and good performance in new environments. This study hinted that the OCC model may have become too customised to its original dataset of mainly formal urban structures. Secondly, it is worthwhile to carefully and critically evaluate the segmentation output by examining the more granular binary classification metrics. Thirdly, rudimentary conclusions can be drawn from the depth-wise and weight-wise changes. Unfortunately, the results for how depth and weight changes influence performance when the complex Dzaleka camps datasets were introduced remain inconclusive; it is important for future studies to address this. However, the results indicate that this might scale differently, and more predictably, when a larger batch size, much deeper networks, and significantly more data can be provided.\\\par
To narrow down the choices, further investigation will be needed. Scaling up from the experimental framework defined for this study could be a good strategy for succession, allowing for a well-constrained extension of this study. Additionally, baseline experiments with different hyperparameters, where the isolated parameters are the initialised weights and architectures, would be very interesting. Nevertheless, scaling will only be possible with significantly more computational and data resources.\\\par
In conclusion, this study provided a pilot diagnostic understanding of how HOT could begin their open-sourced AI-assisted mapping initiative. The study defined rigorous and scalable experiment and assessment methods. With increasing data availability from the OpenAerialMap initiative, this study hopes to have made a foundational contribution to the beginning of a much larger study. Future development should focus on expanding this study in a methodical and controlled fashion. It will be interesting for further research to focus on the inconclusive RQ3 of this study, which concerns the initialised weights, their influence on the performance of the respective architectures, and how scaling the network size changes such influence. This will allow for a data-based approach to justify resources for scaling experiments and future AI-based products.\\\par
\newpage
\printbibliography[
heading=bibintoc,
title={Bibliography}
]
\newpage
\chapter{Appendix}\label{Appen}
\subsection{Adam optimiser}\label{Adam}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.4]{Adam.png}
\caption{The algorithm of Adam (Kingma \& Ba., 2017).}
\label{app:Adam}
\end{figure}
\subsection{EfficientNet}\label{Eff-Net}
\begin{figure}[H]
\centering
\includegraphics[scale = 0.2]{Eff-Net_compound.png}
\caption{Compound scaling of the EfficientNet (Tan \& Le, 2020)}
\label{app:EfficientNet}
\end{figure}
\newpage
\begin{landscape}
\subsection{Mean class-based accuracy assessments per experiment}
\begin{table}[H]
\resizebox{0.65\textwidth}{!}{\begin{minipage}{\textwidth}
\centering
\begin{tabular}{|l|l|l|l|l|l|l|l|}
\hline
Experiment & Input\_dataset & mean\_Precision & mean\_Recall(TPR) & mean\_Specificity(TNR) & mean\_OA & mean\_Dice & mean\_IoU \\ \hline
4 layer U-Net & KBY & 0.8761 & 0.9570 & 0.9881 & 0.9856 & 0.9148 & 0.8429 \\ \hline
5 layer U-Net & KBY & 0.8873 & 0.9544 & 0.9894 & 0.9866 & 0.9197 & 0.8513 \\ \hline
EB1-U-Net (IMN) & KBY & 0.9064 & 0.9686 & 0.9912 & 0.9894 & 0.9365 & 0.8805 \\ \hline
EB1-U-Net (NoIMN) & KBY & 0.9030 & 0.9604 & 0.9910 & 0.9885 & 0.9308 & 0.8706 \\ \hline
EB1-U-Net (OCC-transfer-Untrained) & KBY & 0.9053 & 0.6681 & 0.9939 & 0.9677 & 0.6245 & 0.5315 \\ \hline
EB1-U-Net (OCC-transfer-trained) & KBY & 0.9056 & 0.9740 & 0.9911 & 0.9897 & 0.9385 & 0.8842 \\ \hline
EB2-U-Net (IMN) & KBY & 0.9097 & 0.9623 & 0.9916 & 0.9893 & 0.9353 & 0.8784 \\ \hline
EB2-U-Net (NoIMN) & KBY & 0.9004 & 0.9486 & 0.9908 & 0.9874 & 0.9239 & 0.8585 \\ \hline
4 layer U-Net & ALL & 0.8108 & 0.9412 & 0.9391 & 0.9395 & 0.8712 & 0.7718 \\ \hline
5 layer U-Net & ALL & 0.8174 & 0.9392 & 0.9418 & 0.9412 & 0.8741 & 0.7763 \\ \hline
EB1-U-Net (IMN) & ALL & 0.7877 & 0.9365 & 0.9300 & 0.9314 & 0.8557 & 0.7478 \\ \hline
EB1-U-Net (NoIMN) & ALL & 0.7785 & 0.9169 & 0.9276 & 0.9253 & 0.8420 & 0.7272 \\ \hline
EB1-U-Net (OCC-transfer-Untrained) & ALL & 0.9029 & 0.2816 & 0.9916 & 0.8374 & 0.4293 & 0.2733 \\ \hline
EB1-U-Net (OCC-transfer-trained) & ALL & 0.7810 & 0.9307 & 0.9276 & 0.9283 & 0.8493 & 0.7381 \\ \hline
EB2-U-Net (IMN) & ALL & 0.7813 & 0.9345 & 0.9274 & 0.9290 & 0.8511 & 0.7408 \\ \hline
EB2-U-Net (NoIMN) & ALL & 0.8014 & 0.9229 & 0.9365 & 0.9336 & 0.8579 & 0.7511 \\ \hline
\end{tabular}
\end{minipage}}
\caption{Mean class-based accuracy assessments per experiment (values rounded to four decimal places).}
\label{app:mean_data}
\end{table}
\end{landscape}
\end{document}