From: Benjamin Mako Hill <mako@atdot.cc>
Date: Sat, 28 Sep 2024 23:27:12 +0000 (-0700)
Subject: Merge branch 'COM481-2024Q4'
X-Git-Url: https://code.communitydata.science/coldcallbot-discord.git/commitdiff_plain/d6b74612e3a2f01112fa14c6c86bec9c20093ed3?hp=3c9c64edc856e6c8656614b5106b1f3f67704621

Merge branch 'COM481-2024Q4'
---

diff --git a/assessment_and_tracking/compute_final_case_grades.R b/assessment_and_tracking/compute_final_case_grades.R
index b26270b..93d6d1f 100644
--- a/assessment_and_tracking/compute_final_case_grades.R
+++ b/assessment_and_tracking/compute_final_case_grades.R
@@ -1,72 +1,148 @@
 ## load in the data
 #################################
+myuw <- read.csv("../data/2022_winter_COM_481_A_students.csv", stringsAsFactors=FALSE)
 
-myuw <- read.csv("myuw-COMMLD_570_A_spring_2021_students.csv", stringsAsFactors=FALSE)
+current.dir <- getwd()
+source("../assessment_and_tracking/track_participation.R")
+setwd(current.dir)
+
+rownames(d) <- d$unique.name
+call.list$timestamp <- as.Date(call.list$timestamp)
 
 ## class-level variables
-question.grades <- c("GOOD"=100, "FAIR"=100-(50/3.3), "WEAK"=100-(50/(3.3)*2))
+gpa.point.value <- 50/(4 - 0.7)
+question.grades <- c("PLUS"=100, "CHECK"=100-gpa.point.value, "MINUS"=100-(gpa.point.value*2))
+missed.question.penalty <- gpa.point.value * 0.2 ## 1/5 of a full point on the GPA scale
 
-source("../assessment_and_tracking/track_participation.R")
-setwd("case_grades")
+## inspect and set the absence threshold
+ggplot(d) + aes(x=absences) + geom_histogram(binwidth=1, fill="white",color="black")
+absence.threshold <- median(d$absences)
 
-rownames(d) <- d$unique.name
+
+## inspect and set the questions cutoff
+## questions.cutoff <- median(d$num.calls)
+## median(d$num.calls)
+## questions.cutoff <- nrow(call.list) / nrow(d) ## TODO talk about this
+## this is the 95% percentile based on simulation in simulation.R
+questions.cutoff <- 4
 
 ## show the distribution of assessments
 table(call.list$assessment)
 prop.table(table(call.list$assessment))
-table(call.list$answered)
-prop.table(table(call.list$answered))
+
+table(call.list.full$answered)
+prop.table(table(call.list.full$answered))
 
 total.questions.asked <- nrow(call.list)
 
-## generate grades
-##########################################################
+## find out how many questions folks have been present/absent for.
+## 
+## NOTE: this is currently only for informational purposes and is NOT
+## being used to compute grades in any way.
+########################################################################
+calls.per.day <- data.frame(day=as.Date(names(table(call.list$timestamp))),
+                            questions.asked=as.numeric(table(call.list$timestamp)))
+
+## function to return the numbers of calls present for or zero if they
+## were absent
+calls.for.student.day <- function (day, student.id) {
+    ## a student with an absence recorded on `day` was present for zero calls
+    was.absent <- any(absence$unique.name == student.id &
+                      absence$date.absent == day)
+    if (was.absent) return(0)
+    calls.per.day$questions.asked[calls.per.day$day == day]
+}
+
+compute.questions.present.for.student <- function (student.id) {
+    sum(unlist(lapply(unique(calls.per.day$day), function (day) calls.for.student.day(day, student.id))))
+}
 
-d$part.grade <- NA
+## create new column with number of questions present
+d$q.present <- unlist(lapply(d$unique.name, compute.questions.present.for.student))
+d$prop.asked <- d$num.calls / d$q.present
+
+## generate statistics using these new variables
+prop.asks.quantiles <- quantile(d$prop.asked, probs=seq(0,1, 0.01))
+prop.asks.quantiles <- prop.asks.quantiles[!duplicated(prop.asks.quantiles)]
+
+d$prop.asked.quant <- cut(d$prop.asked, right=FALSE, breaks=c(prop.asks.quantiles, 1),
+    labels=names(prop.asks.quantiles)[1:(length(prop.asks.quantiles))])
+
+## generate grades
+########################################################################
 
 ## print the median number of questions for (a) everybody and (b)
 ## people that have been present 75% of the time
 median(d$num.calls)
 
-questions.cutoff <- median(d$num.calls)
-
 ## helper function to generate average grade minus number of missing
 gen.part.grade <- function (x.unique.name) {
     q.scores <- question.grades[call.list$assessment[call.list$unique.name == x.unique.name]]
     base.score <- mean(q.scores, na.rm=TRUE)
 
     ## number of missing days
-    # missing.days <- nrow(missing.in.class[missing.in.class$unique.name == x.unique.name,])
+    missing.in.class.days <- nrow(missing.in.class[missing.in.class$unique.name == x.unique.name,])
 
     ## return the final score
     data.frame(unique.name=x.unique.name,
-               part.grade=(base.score))
+               base.grade=base.score,
+               missing.in.class.days=missing.in.class.days)
 }
 
 
+## create the base grades which do NOT include missing questions
 tmp <- do.call("rbind", lapply(d$unique.name, gen.part.grade))
+d <- merge(d, tmp)
+rownames(d) <- d$unique.name
+d$part.grade <- d$base.grade
 
-d[as.character(tmp$unique.name), "part.grade"] <- tmp$part.grade
+## first we handle the zeros
+## step 1: first double check the people who have zeros and ensure that they didn't "just" get unlucky
+d[d$num.calls == 0,]
 
-## generate the baseline participation grades as per the process above
+## set those people to 0 :(
+d$part.grade[d$num.calls == 0] <- 0
 
-## map part grades back to 4.0 letter scale and points
-d$part.4point <-round((d$part.grade / (50/3.3)) - 2.6, 2)
+## step 2: identify the people who were not asked "enough"
+## questions but were unlucky/lucky
+
+## first this just prints out the people who were not called
+## simply because they got unlucky
+d[d$num.calls < questions.cutoff & d$absences < absence.threshold,]
+
+## these are the people who were NOT simply unlucky (i.e., they were
+## called rarely because they were not in class very often)
+penalized.unique.names <- d$unique.name[d$num.calls < questions.cutoff & d$absences > absence.threshold]
+d[d$unique.name %in% penalized.unique.names,]
+
+## now add "zeros" for every question they are below the cutoff
+d[as.character(penalized.unique.names),"part.grade"] <- (
+    (d[as.character(penalized.unique.names),"num.calls"] * d[as.character(penalized.unique.names),"part.grade"])
+    / questions.cutoff)
 
-d[sort.list(d$part.4point),]
+d[as.character(penalized.unique.names),]
 
+## apply the penalty for the number of days we called on them and they were gone
+d$part.grade <- d$part.grade - d$missing.in.class.days * missed.question.penalty
 
-## writing out data
+## TODO ensure this is right. i think it is
+## map part grades back to 4.0 letter scale and points
+d$part.4point <- round((d$part.grade / gpa.point.value) - ((100 / gpa.point.value) - 4), 2)
+
+d[sort.list(d$part.4point, decreasing=TRUE),
+  c("unique.name", "short.name", "num.calls", "absences", "part.4point")]
+
+## writing out data to CSV
 d.print <- merge(d, myuw[,c("StudentNo", "FirstName", "LastName", "UWNetID")],
-           by.x="student.num", by.y="StudentNo")
-write.csv(d.print, file="final_participation_grades.csv")
-
-## library(rmarkdown)
-
-## for (x.unique.name in d$unique.name) {
-##     render(input="../../assessment_and_tracking/student_report_template.Rmd",
-##            output_format="html_document",
-##            output_file=paste("../data/case_grades/student_reports/",
-##                              d.print$UWNetID[d.print$unique.name == x.unique.name],
-##                              sep=""))
-## }
+                 by.x="unique.name", by.y="StudentNo")
+write.csv(d.print, file="../data/final_participation_grades.csv")
+
+library(rmarkdown)
+
+## render one HTML report per student that survived the merge with myuw
+for (id in d.print$unique.name) {
+    ## the template refers to `id` and `d.print` by name
+    render(input="student_report_template.Rmd",
+           output_format="html_document",
+           output_file=paste0("../data/case_grades/", id))
+}
diff --git a/assessment_and_tracking/simulation.R b/assessment_and_tracking/simulation.R
new file mode 100644
index 0000000..7134bef
--- /dev/null
+++ b/assessment_and_tracking/simulation.R
@@ -0,0 +1,24 @@
+weight.fac <- 2
+num.calls <- 373
+num.students <- 76
+
+gen.calls.per.students <- function (x) {
+    ## x is ignored; it only lets lapply() call this once per simulation run
+    raw.weights <- setNames(rep(1, num.students), seq_len(num.students))
+
+    table(vapply(seq_len(num.calls), function (i) {
+        probs <- raw.weights / sum(raw.weights)
+        selected <- sample(names(raw.weights), 1, prob=probs)
+        ## `<<-` updates raw.weights in the enclosing call (not a global):
+        ## the selected student's weight is divided by weight.fac
+        raw.weights[selected] <<- raw.weights[selected] / weight.fac
+        selected
+    }, character(1)))
+}
+
+
+simulated.call.list <- unlist(lapply(1:1000, gen.calls.per.students))
+hist(simulated.call.list)
+
+quantile(simulated.call.list, probs=seq(0,1,by=0.01))
+quantile(simulated.call.list, probs=0.05)
diff --git a/assessment_and_tracking/student_report_template.Rmd b/assessment_and_tracking/student_report_template.Rmd
index a0b2145..866b1e0 100644
--- a/assessment_and_tracking/student_report_template.Rmd
+++ b/assessment_and_tracking/student_report_template.Rmd
@@ -1,22 +1,19 @@
-**Student Name:** `r paste(d.print[d.print$discord.name == x.discord.name, c("FirstName", "LastName")])`
+**Student Name:** `r paste(d.print[d.print$unique.name == id, c("LastName", "FirstName")])` (`r id`)
 
-**Discord Name:** `r d.print[d.print$discord.name == x.discord.name, c("discord.name")]`
+**Participation grade:** `r d.print$part.4point[d.print$unique.name == id]`
 
-**Participation grade:** `r d.print$part.4point[d.print$discord.name == x.discord.name]`
+**Questions asked:** `r d.print[d.print$unique.name == id, "num.calls"]`
 
-**Questions asked:** `r d.print[d$discord.name == x.discord.name, "prev.questions"]`
+**Days Absent:** `r d.print[d.print$unique.name == id, "absences"]` / `r length(unique(as.Date(unique(call.list$timestamp))))`
 
-**Days Absent:** `r d.print[d.print$discord.name == x.discord.name, "days.absent"]` / `r case.sessions`
+**Missing in class days:** `r d.print[d.print$unique.name == id, "missing.in.class.days"]` (base grade lowered by 0.2 per day)
 
 **List of questions:**
 
 ```{r echo=FALSE}
-call.list[call.list$discord.name == x.discord.name,]
+call.list[call.list$unique.name == id,]
 ```
 
-**Luckiness:** `r d.print[d.print$discord.name == x.discord.name, "prop.asked.quant"]`
-
-If you a student has a luckiness over 50% that means that they were helped by the weighting of the system and/or got lucky. We did not penalize *any* students with a luckiness under 50% for absences.
 
 
 
diff --git a/assessment_and_tracking/track_enrolled.R b/assessment_and_tracking/track_enrolled.R
index f0d0fcb..47e50c2 100644
--- a/assessment_and_tracking/track_enrolled.R
+++ b/assessment_and_tracking/track_enrolled.R
@@ -1,5 +1,5 @@
-myuw <- read.csv("myuw-COMMLD_570_A_spring_2021_students.csv")
-gs <- read.delim("student_information.tsv")
+myuw <- read.csv("../data/2022_winter_COM_481_A_students.csv")
+gs <- read.delim("../data/student_information.tsv")
 
 ## these are students who dropped the class (should be empty)
 gs[!gs$Your.UW.student.number %in% myuw$StudentNo,]
diff --git a/assessment_and_tracking/track_participation.R b/assessment_and_tracking/track_participation.R
index 28b8a4e..37898e7 100644
--- a/assessment_and_tracking/track_participation.R
+++ b/assessment_and_tracking/track_participation.R
@@ -1,25 +1,127 @@
 setwd("~/online_communities/coldcallbot/data/")
 
-library(ggplot2)
 library(data.table)
 
-gs <- read.delim("student_information.tsv")
-d <- gs[,c(2,4)]
-colnames(d) <- c("student.num", "unique.name")
+################################################
+## LOAD call_list TSV data
+################################################
 
 call.list <- do.call("rbind", lapply(list.files(".", pattern="^call_list-.*tsv$"), function (x) {read.delim(x, stringsAsFactors=FALSE)[,1:4]}))
 
 colnames(call.list) <- gsub("_", ".", colnames(call.list))
 
-table(call.list$unique_name[call.list$answered])
+table(call.list$unique.name[call.list$answered])
 
 ## drop calls where the person wasn't present
 call.list.full <- call.list
 call.list[!call.list$answered,]
 call.list <- call.list[call.list$answered,]
 
+## show the distribution of assessments
+prop.table(table(call.list$assessment))
+
 call.counts <- data.frame(table(call.list$unique.name))
 colnames(call.counts) <- c("unique.name", "num.calls")
 
-d <- merge(d, call.counts, all.x=TRUE, all.y=TRUE, by="unique.name"); d
+## create list of folks who are missing in class w/o reporting it
+absence.data.cols <- c("unique.name", "date.absent", "reported")
+
+missing.in.class <- call.list.full[!call.list.full$answered,
+                                   c("unique.name", "timestamp")]
+missing.in.class$date.absent <- as.Date(missing.in.class$timestamp)
+missing.in.class$reported <- FALSE
+missing.in.class <- missing.in.class[,absence.data.cols]
+missing.in.class <- unique(missing.in.class)
+
+################################################
+## LOAD absence data TSV data
+################################################
+
+absence.google <- read.delim("absence_poll_data.tsv")
+colnames(absence.google) <- c("timestamp", "unique.name", "date.absent")
+absence.google$date.absent <- as.Date(absence.google$date.absent, format="%m/%d/%Y")
+absence.google$reported <- TRUE
+absence.google <- absence.google[,absence.data.cols]
+absence.google <- unique(absence.google)
+
+## combine the two absence lists and then create a unique subset
+absence <- rbind(missing.in.class[,absence.data.cols],
+                 absence.google[,absence.data.cols])
+
+## these are people that show up in both lists (i.e., probably they
+## submitted too late), but it's worth verifying before we penalize
+## them. i'd actually remove them from the absence sheet to suppress
+## this error
+absence[duplicated(absence[,1:2]),]
+absence <- absence[!duplicated(absence[,1:2]),]
+
+## print total questions asked and absences
+absence.count <- data.frame(table(unique(absence[,c("unique.name", "date.absent")])[,"unique.name"]))
+colnames(absence.count) <- c("unique.name", "absences")
+
+
+## load up the full class list
+gs <- read.delim("student_information.tsv")
+d <- gs[,c("Your.UW.student.number", "Name.you.d.like.to.go.by.in.class")]
+colnames(d) <- c("unique.name", "short.name")
+
+## merge in the call counts
+d <- merge(d, call.counts, all.x=TRUE, all.y=FALSE, by="unique.name")
+d <- merge(d, absence.count, by="unique.name", all.x=TRUE, all.y=FALSE)
+
+d
+
+## set anything that's missing to zero
+d$num.calls[is.na(d$num.calls)] <- 0
+d$absences[is.na(d$absences)] <- 0
+
+################################################
+## list people who have been absent often or called on a lot
+################################################
+
+
+## list students sorted in terms of (a) absences and (b) prev questions
+d[sort.list(d$absences),]
+
+d[sort.list(d$num.calls, decreasing=TRUE),]
+
+################################################
+## build visualizations
+################################################
+
+
+library(ggplot2)
+
+color.gradient <- scales::seq_gradient_pal("yellow", "magenta", "Lab")(seq(0,1,length.out=range(d$absences)[2]+1))
+
+table(d$num.calls, d$absences)
+
+png("questions_absence_histogram_combined.png", units="px", width=600, height=400)
+
+print(ggplot(d) +
+    aes(x=as.factor(num.calls), fill=as.factor(absences)) +
+    ## geom_bar() counts rows itself; explicit print() so the plot is drawn when source()d
+    geom_bar(color="black") +
+    scale_x_discrete("Number of questions answered") +
+    scale_y_continuous("Number of students") +
+    ##scale_fill_brewer("Absences", palette="Blues") +
+    scale_fill_manual("Absences", values=color.gradient) +
+    theme_bw())
+
+dev.off()
+
+absence.labeller <- function (df) {
+    lapply(df, function (value) paste("Absences:", value)) ## prefix every value in each element of df
+}
+
+## png("questions_absence_histogram_facets.png", units="px", width=600, height=400)
+
+## ggplot(d) +
+##     aes(x=as.factor(num.calls)) +
+##     geom_bar() +
+##     stat_count() +
+##     scale_x_discrete("Number of questions answered") +
+##     scale_y_continuous("Number of students") +
+##     theme_bw() +
+##     facet_wrap(.~absences, ncol=5, labeller="absence.labeller")
 
diff --git a/coldcall.py b/coldcall.py
index 2250fac..37b4eb5 100644
--- a/coldcall.py
+++ b/coldcall.py
@@ -26,9 +26,10 @@ class ColdCall():
         self.__fn_daily_attendance = config["daily_attendance"].format(date=self.today)
 
         self.unique_row = config["unique_name_rowname"]
-        self.preferred_row = config["preferred_name_rowname"]
-
-        self.preferred_names = self.__get_preferred_names()
+        if "preferred_name_rowname" in config:
+            self.preferred_row = config["preferred_name_rowname"]
+        else:
+            self.preferred_row = None
         
     def __load_prev_questions(self):
         previous_questions = defaultdict(int)
@@ -59,7 +60,7 @@ class ColdCall():
         else:
             return None
 
-    def __select_student_from_list (self, students_present):
+    def select_student_from_list(self, students_present):
         prev_questions = self.__load_prev_questions()
         
         # created a weighted list by starting out with everybody 1
@@ -74,7 +75,7 @@ class ColdCall():
         # print(weights) # DEBUG LINE
         return choices(list(weights.keys()), weights=list(weights.values()), k=1)[0]
 
-    def __record_attendance(self, students_present):
+    def record_attendance(self, students_present):
         # if it's the first one of the day, write it out
         if not os.path.exists(self.__fn_daily_attendance):
             with open(self.__fn_daily_attendance, "w") as f:
@@ -86,7 +87,7 @@ class ColdCall():
                              ",".join(students_present)]),
                   file=f)
 
-    def __record_coldcall(self, selected_student):
+    def record_coldcall(self, selected_student):
         # if it's the first one of the day, write it out
         if not os.path.exists(self.__fn_daily_calllist):
             with open(self.__fn_daily_calllist, "w") as f:
@@ -100,12 +101,12 @@ class ColdCall():
                              "MISSING", "MISSING", str(datetime.now())]), file=f)
 
     def coldcall(self, students_present):
-        selected_student = self.__select_student_from_list(students_present)
+        selected_student = self.select_student_from_list(students_present)
 
         # record the called-upon student in the right place
         if self.record_attendance:
-            self.__record_attendance(students_present)
-        self.__record_coldcall(selected_student)
+            self.record_attendance(students_present)
+        self.record_coldcall(selected_student)
 
         preferred_name = self.__get_preferred_name(selected_student)
         if preferred_name:
diff --git a/coldcallbot-manual.py b/coldcallbot-manual.py
index a4268ea..6c128ba 100755
--- a/coldcallbot-manual.py
+++ b/coldcallbot-manual.py
@@ -1,15 +1,53 @@
 #!/usr/bin/env python3
 
 from coldcall import ColdCall
-import re
+from datetime import datetime
+from csv import DictReader
+
+current_time = datetime.today()
 
 ## create the coldcall object
-cc = ColdCall(record_attendance=False)
+cc = ColdCall(record_attendance=False, preferred_name_field="Name you'd like to go by in class")
+
+def get_missing(d=current_time):  # yields student numbers reported absent on date d
+    date_string = f'{d.month}/{d.day}/{d.year}'
+    with open("data/absence_poll_data.tsv", 'r') as f:
+        for row in DictReader(f, delimiter="\t"):
+            if row["Date of class session you will be absent"] == date_string:
+                yield row["Your UW student number"].strip()  # strip to match the registered_students keys
+
+full_names = {}
+registered_students = []
+with open("data/2022_winter_COM_481_A_students.csv", 'r') as f:
+    for row in DictReader(f, delimiter=","):
+        student_no = row["StudentNo"].strip()
+        registered_students.append(student_no)
+        full_names[student_no] = f"{row['FirstName']} {row['LastName']}"
+## print("Registered:", registered_students)
 
-student_list = cc.preferred_names
+missing_today = [x for x in get_missing(current_time)]
+## print("Missing Today: ", missing_today)
 
-# print out 100 students
+preferred_names = {}
+with open("data/student_information.tsv", 'r') as f:
+    for row in DictReader(f, delimiter="\t"):
+        preferred_names[row["Your UW student number"]] = row["Name you'd like to go by in class"]
+## print("Preferred names:", preferred_names)
+
+students_present = [s for s in registered_students if s not in missing_today]
+## print("Students present:", students_present)
 
 for i in range(100):
-    print(f"{i + 1}. {cc.coldcall(student_list)} [ ] [ ]\n")
+    selected_student = cc.select_student_from_list(students_present)
+
+    try:
+        preferred_name = preferred_names[selected_student]
+    except KeyError:
+        preferred_name = "MISSING PREFERRED NAME"
+
+    print(f"{i + 1}.",
+          preferred_name, "::",
+          selected_student, "::",
+          full_names[selected_student])
+    cc.record_coldcall(selected_student)