-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdna_classifier.py
31 lines (23 loc) · 1.25 KB
/
dna_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
# Load the DNA profiles data into a Pandas dataframe
dna_profiles = pd.read_csv("dna_profiles.csv")
# Split the data into training and testing sets
train_size = int(0.8 * len(dna_profiles))
training_data = dna_profiles[:train_size]
testing_data = dna_profiles[train_size:]
# Ensures there's data in both training and testing datasets
if len(training_data) == 0 or len(testing_data) == 0:
raise ValueError("Insufficient data for training or testing. Ensure your CSV has enough records.")
# Ensure n_neighbors is less than or equal to the number of training samples
num_neighbors = min(5, len(training_data))
# Create the classifier with the specified number of neighbors
classifier = KNeighborsClassifier(n_neighbors=num_neighbors)
# Train the classifier using the DNA profile markers as features and "Label" as the target
classifier.fit(training_data[["A", "C", "G", "T"]], training_data["Label"])
# Test the classifier on the testing data
predictions = classifier.predict(testing_data[["A", "C", "G", "T"]])
# Compare the predictions to the actual labels
correct_predictions = sum(predictions == testing_data["Label"])
accuracy = correct_predictions / len(testing_data)
print("Accuracy:", accuracy)