// paperless-field-extractor/src/server.rs

use std::{
collections::{BTreeMap, VecDeque},
hash::Hash,
path::Path,
sync::Arc,
time::Duration,
};
use actix_web::{
App, HttpResponse, HttpServer, ResponseError,
dev::HttpServiceFactory,
http::{StatusCode, Uri},
post,
web::{self, Data},
};
use itertools::Itertools;
use once_cell::sync::Lazy;
use paperless_api_client::{
Client,
types::{CustomField, CustomFieldInstance, Document, Tag},
};
use regex::Regex;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::{
join, spawn,
sync::{Mutex, RwLock},
task::{JoinError, spawn_blocking},
};
use utoipa::{OpenApi, ToSchema};
use utoipa_swagger_ui::SwaggerUi;
// extracts the numeric document id from a paperless document URL path
static DOCID_REGEX: Lazy<Regex> = regex_static::lazy_regex!(r"documents/(\d+)");
static PROCESSING_QUEUE: Lazy<tokio::sync::RwLock<VecDeque<DocumentProcessingRequest>>> =
Lazy::new(|| RwLock::new(VecDeque::new()));
// shutdown bit; when set to true the document processing pipeline will shut down
static STOP_FLAG: Lazy<tokio::sync::RwLock<bool>> = Lazy::new(|| RwLock::new(false));
// model will only be initialized and stored if there are documents that need processing
static MODEL_SINGLETON: Lazy<tokio::sync::Mutex<Option<LLModelExtractor>>> =
Lazy::new(|| Mutex::new(None));
use crate::{
config::Config,
extract::{LLModelExtractor, ModelError},
requests,
types::{
Decision, FieldError, FieldExtract, custom_field_learning_supported,
schema_from_decision_question,
},
};
#[derive(Debug, PartialEq, Clone)]
#[non_exhaustive]
enum ProcessingType {
CustomFieldPrediction,
CorrespondentSuggest,
DecisionTagFlow {
question: String,
true_tag: Option<Tag>,
false_tag: Option<Tag>,
},
}
impl Hash for ProcessingType {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
// hashing is only used to deduplicate processing types in some operations;
// those operations do not care about sub-parameters, just about the kind of request
core::mem::discriminant(self).hash(state);
}
}
impl Eq for ProcessingType {}
/// Most processing of a document involves feeding document data through a large language model.
/// Since LLMs are notoriously resource intensive, a task queue is used to facilitate asynchronous,
/// batched processing; this container holds a single queued processing request.
struct DocumentProcessingRequest {
document: Document,
processing_type: ProcessingType,
overwrite_finished_tag: Option<Tag>,
}
#[derive(Debug, Error)]
enum DocumentProcessingError {
#[error("Could not start LLM processinsg thread: {0}")]
SpawningProcessingThreadFailed(#[from] JoinError),
#[error(transparent)]
ModelProcessingError(#[from] ModelError),
#[error(transparent)]
PaperlessCommunicationError(#[from] paperless_api_client::types::error::Error),
#[error(transparent)]
ExtractionError(#[from] FieldError),
}
async fn handle_correspondent_suggest(
doc: &mut Document,
api_client: &mut Client,
) -> Result<(), DocumentProcessingError> {
let crrspndts = requests::fetch_all_correspondents(api_client).await;
let crrspndts_suggest_schema = crate::types::schema_from_correspondents(crrspndts.as_slice());
let doc_data = serde_json::to_value(&doc.content).unwrap();
let extracted_correspondent = spawn_blocking(move || {
let mut model_singleton = MODEL_SINGLETON.blocking_lock();
if let Some(model) = model_singleton.as_mut() {
model.extract(&doc_data, &crrspndts_suggest_schema, false)
} else {
Err(crate::extract::ModelError::ModelNotLoaded)
}
})
.await??;
log::debug!(
"Suggested new correspondent for doc {}: \n{extracted_correspondent:#?}",
doc.id
);
let extracted_correspondent: FieldExtract =
serde_json::from_value(extracted_correspondent).map_err(FieldError::from)?;
let new_crrspndt = extracted_correspondent.to_correspondent(&crrspndts)?;
doc.correspondent = Some(new_crrspndt.id);
// deferred sync back to the paperless instance:
// after a successful finish the state of the document on paperless will be updated by the update task
Ok(())
}
async fn handle_custom_field_prediction(
doc: &mut Document,
api_client: &mut Client,
) -> Result<(), DocumentProcessingError> {
// fetch all custom field definitions for fields on the document that need to be filled
let relevant_custom_fields: Vec<CustomField> = requests::get_custom_fields_by_id(
api_client,
doc.custom_fields.as_ref().map(|cfis| {
cfis.iter()
.filter(|cfi| cfi.value.is_none())
.map(|cfi| cfi.field)
.collect()
}),
)
.await
// this filters out all custom fields that are currently unsupported
.into_iter().filter(|cf| {
let learning_supported = custom_field_learning_supported(cf);
if !learning_supported {
log::warn!("Custom Fields with name `{}` are using an unsupported custom field type, will be ignored!", cf.name);
}
learning_supported
}).collect();
for cf in relevant_custom_fields {
let doc_data = serde_json::to_value(&doc).unwrap();
if let Some(field_grammar) = crate::types::schema_from_custom_field(&cf) {
let extracted_cf = spawn_blocking(move || {
let mut model_singleton = MODEL_SINGLETON.blocking_lock();
if let Some(model) = model_singleton.as_mut() {
model.extract(&doc_data, &field_grammar, false)
} else {
Err(crate::extract::ModelError::ModelNotLoaded)
}
})
.await??;
let extracted_cf: FieldExtract =
serde_json::from_value(extracted_cf).map_err(FieldError::from)?;
if let Ok(cf_value) = extracted_cf.to_custom_field_instance(&cf).map_err(|err| {
log::error!("{err}");
err
}) {
// update the custom fields on the local copy of the document;
// sending the updated document to the server happens later in the update task
log::debug!(
"Extracted custom field for document {}\n {:#?}",
doc.id,
cf_value
);
if let Some(doc_custom_fields) = doc.custom_fields.as_mut() {
for doc_cf_i in doc_custom_fields.iter_mut() {
if doc_cf_i.field == cf_value.field {
*doc_cf_i = cf_value.clone();
}
}
}
}
}
}
// deferred sync back to the paperless instance:
// after a successful finish the state of the document on paperless will be updated by the update task
Ok(())
}
async fn handle_decision(
doc: &mut Document,
question: &String,
true_tag: Option<&Tag>,
false_tag: Option<&Tag>,
) -> Result<(), DocumentProcessingError> {
let decision_schema = schema_from_decision_question(question);
let doc_data = serde_json::to_value(&doc.content).unwrap();
let extracted_answer = spawn_blocking(move || {
let mut model_singleton = MODEL_SINGLETON.blocking_lock();
if let Some(model) = model_singleton.as_mut() {
model.extract(&doc_data, &decision_schema, false)
} else {
Err(crate::extract::ModelError::ModelNotLoaded)
}
})
.await??;
let extracted_decision: Decision =
serde_json::from_value(extracted_answer).map_err(FieldError::from)?;
log::debug!(
"Made decision on document {}\n{extracted_decision:#?}",
doc.id
);
if let Some(true_tag) = true_tag
&& extracted_decision.answer_bool
&& !doc.tags.contains(&true_tag.id)
{
doc.tags.push(true_tag.id);
}
if let Some(false_tag) = false_tag
&& !extracted_decision.answer_bool
&& !doc.tags.contains(&false_tag.id)
{
doc.tags.push(false_tag.id);
}
Ok(())
}
#[derive(Debug, thiserror::Error)]
enum WebhookError {
#[error("Document with id `{0}` does not exist!")]
DocumentDoesNotExist(i64),
#[error("Document ID is not a valid integer!")]
InvalidDocumentId,
#[error("Could not parse Document ID from `document_url` field!")]
DocumentUrlParsingIDFailed,
#[error("Document Url points to a server unrelated to this configuration. Ignoring Request")]
ReceivedRequestFromUnconfiguredServer,
#[error("Request specified tag {0}, but it could not be found, neither id nor name exists!")]
TagNotFound(String),
#[error(
"The request is configured in a way that nothing is going to happen when the request completes, so it will be ignored!"
)]
RequestWithoutEffect,
}
#[derive(Debug, Serialize, Deserialize, ToSchema)]
struct DecisionTagFlowRequest {
/// url of the document that should be processed
document_url: String,
/// question about the document, should be answerable with true/false
question: String,
/// optional tag to assign if answer is true
true_tag: Option<String>,
/// optional tag to assign if the answer is false
false_tag: Option<String>,
}
#[derive(Debug, Serialize, Deserialize, ToSchema)]
/// General webhook parameters; any workflow trigger will accept this type
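///
/// A minimal illustrative request body (the URL and tag are placeholder values; `next_tag`
/// may be a tag name or a numeric id and can be omitted entirely):
///
/// ```json
/// {
///   "document_url": "https://paperless.example.com/documents/42/",
///   "next_tag": "llm-done"
/// }
/// ```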
struct WebhookParams {
/// url of the document that should be processed
document_url: String,
#[serde(default)]
/// tag to apply to the document when processing finishes; optional, if unspecified the configured finished tag will be set
next_tag: Option<String>,
}
impl ResponseError for WebhookError {
fn status_code(&self) -> actix_web::http::StatusCode {
match self {
WebhookError::ReceivedRequestFromUnconfiguredServer => StatusCode::UNAUTHORIZED,
_ => StatusCode::BAD_REQUEST,
}
}
}
impl WebhookParams {
async fn handle_request(
&self,
status_tags: Data<PaperlessStatusTags>,
api_client: Data<Client>,
config: Data<Config>,
document_pipeline: web::Data<tokio::sync::mpsc::UnboundedSender<DocumentProcessingRequest>>,
req_type: ProcessingType,
) -> Result<(), WebhookError> {
let doc_url: Uri = self
.document_url
.parse()
.map_err(|_| WebhookError::InvalidDocumentUrl)?;
let configured_paperless_server: Uri = config
.paperless_server
.parse()
.expect("configured paperless server must be a valid URI");
// verify document host and configured paperless instance are the same host to avoid handling requests from other paperless instances
if doc_url.host().is_some_and(|rs| {
configured_paperless_server
.host()
.is_some_and(|cs| cs == rs)
}) {
if let Some(doc_id_cap) = DOCID_REGEX.captures(doc_url.path()) {
if let Some(doc_id) = doc_id_cap
.get(1)
.and_then(|v| v.as_str().parse::<i64>().ok())
{
let mut api_client =
Arc::<Client>::make_mut(&mut api_client.into_inner()).clone();
if let Ok(mut doc) = api_client.documents().retrieve(doc_id, None, None).await {
// if the document has no processing tag yet, set it
if !doc.tags.contains(&status_tags.processing.id) {
let mut updated_doc_tags = doc.tags.clone();
updated_doc_tags.push(status_tags.processing.id);
let _ = requests::update_document_tag_ids(
&mut api_client,
&mut doc,
&updated_doc_tags,
)
.await;
}
let mut next_tag = None;
if let Some(nt_to_parse) = &self.next_tag {
if let Ok(nt_as_id) = nt_to_parse.parse::<i64>() {
next_tag = requests::fetch_tag_by_id_or_name(
&mut api_client,
None,
Some(nt_as_id),
)
.await;
} else {
next_tag = requests::fetch_tag_by_id_or_name(
&mut api_client,
Some(nt_to_parse.clone()),
None,
)
.await;
}
}
if next_tag.is_none() && self.next_tag.is_some() {
log::warn!(
"Webhook received request to use a specific finished tag, but the tag does not exist (next_tag=`{}`)! Ignoring tag from request!",
self.next_tag.as_ref().unwrap()
);
}
let _ = document_pipeline.send(DocumentProcessingRequest {
document: doc,
processing_type: req_type,
overwrite_finished_tag: next_tag,
});
Ok(())
} else {
Err(WebhookError::DocumentDoesNotExist(doc_id))
}
} else {
Err(WebhookError::InvalidDocumentId)
}
} else {
Err(WebhookError::DocumentUrlParsingIDFailed)
}
} else {
Err(WebhookError::ReceivedRequestFromUnconfiguredServer)
}
}
}
#[utoipa::path(tag = "llm_workflow_trigger", request_body = inline(DecisionTagFlowRequest))]
#[post("/decision")]
/// Ask a question about the document that can be answered with true or false
///
/// The goal of this endpoint is to enable decision based workflows in paperless. Ask a question about the document and
/// if the answer is true the document will be assigned the `true_tag`. If the answer is false and a `false_tag` is specified, that tag will be assigned instead.
/// This enables doing `if/else` style workflows by using tagging to conditionally trigger further processing steps.
///
/// If neither `false_tag` nor `true_tag` is specified the request will be discarded since the result would have no effect!
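///
/// A minimal illustrative request body (tag names and the URL are placeholder values;
/// tags may also be given as numeric IDs):
///
/// ```json
/// {
///   "document_url": "https://paperless.example.com/documents/42/",
///   "question": "Is this document an invoice?",
///   "true_tag": "invoice",
///   "false_tag": "not-invoice"
/// }
/// ```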
async fn decision(
params: web::Json<DecisionTagFlowRequest>,
status_tags: Data<PaperlessStatusTags>,
api_client: Data<Client>,
config: Data<Config>,
document_pipeline: web::Data<tokio::sync::mpsc::UnboundedSender<DocumentProcessingRequest>>,
) -> Result<HttpResponse, WebhookError> {
let mut api_client_cloned =
Arc::<Client>::make_mut(&mut api_client.clone().into_inner()).clone();
let mut true_tag = None;
if let Some(tt_to_parse) = &params.true_tag {
if let Ok(tt_as_id) = tt_to_parse.parse::<i64>() {
true_tag =
requests::fetch_tag_by_id_or_name(&mut api_client_cloned, None, Some(tt_as_id))
.await;
} else {
true_tag = requests::fetch_tag_by_id_or_name(
&mut api_client_cloned,
Some(tt_to_parse.clone()),
None,
)
.await;
}
}
// a true tag is expected; if it was specified but could not be resolved the request is invalid
if true_tag.is_none() && params.true_tag.is_some() {
return Err(WebhookError::TagNotFound(params.true_tag.clone().unwrap()));
}
let mut false_tag = None;
if let Some(ft_to_parse) = &params.false_tag {
if let Ok(ft_as_id) = ft_to_parse.parse::<i64>() {
false_tag =
requests::fetch_tag_by_id_or_name(&mut api_client_cloned, None, Some(ft_as_id))
.await;
} else {
false_tag = requests::fetch_tag_by_id_or_name(
&mut api_client_cloned,
Some(ft_to_parse.clone()),
None,
)
.await;
}
}
// if the request specified a false tag but it could not be found this is also an error:
// the user specifically wants a tag set on a false result, and if the tag does not
// exist this cannot work, so the request is invalid
if false_tag.is_none() && params.false_tag.is_some() {
return Err(WebhookError::TagNotFound(params.false_tag.clone().unwrap()));
}
if true_tag.is_none() && false_tag.is_none() {
return Err(WebhookError::RequestWithoutEffect);
}
let process_type = ProcessingType::DecisionTagFlow {
question: params.question.clone(),
true_tag,
false_tag,
};
let generic_webhook_params = WebhookParams {
document_url: params.document_url.clone(),
next_tag: None,
};
generic_webhook_params
.handle_request(
status_tags,
api_client,
config,
document_pipeline,
process_type,
)
.await?;
Ok(HttpResponse::Accepted().into())
}
#[utoipa::path(tag = "llm_workflow_trigger")]
#[post("/suggest/correspondent")]
/// Workflow to suggest a correspondent for a document
///
/// Given the document content and all possible correspondents, select a correspondent using a
/// reasoning approach. This workflow might take longer due to the LLM reasoning!
///
/// Afterwards the correspondent of the document is set; sadly, just adding it as another suggestion is not supported
/// by the paperless API.
async fn suggest_correspondent(
params: web::Json<WebhookParams>,
status_tags: Data<PaperlessStatusTags>,
api_client: Data<Client>,
config: Data<Config>,
document_pipeline: web::Data<tokio::sync::mpsc::UnboundedSender<DocumentProcessingRequest>>,
) -> Result<HttpResponse, WebhookError> {
params
.handle_request(
status_tags,
api_client,
config,
document_pipeline,
ProcessingType::CorrespondentSuggest,
)
.await?;
Ok(HttpResponse::Accepted().into())
}
#[utoipa::path(tag = "llm_workflow_trigger")]
#[post("/fill/custom_fields")]
/// Workflow to fill unfilled custom fields on a document
///
/// Scan the document for unfilled custom fields and use the LLM to predict their values from the document content.
///
/// ## Supported Custom Field Types
///
/// Currently this project supports predicting the following kinds of custom fields:
/// - [x] Boolean
/// - [x] Date
/// - [x] Integer
/// - [x] Number
/// - [x] Monetary
/// - [x] Text
/// - [x] Select
/// - [ ] Document Link
/// - [ ] URL
/// - [ ] LargeText
async fn custom_field_prediction(
params: web::Json<WebhookParams>,
status_tags: Data<PaperlessStatusTags>,
api_client: Data<Client>,
config: Data<Config>,
document_pipeline: web::Data<tokio::sync::mpsc::UnboundedSender<DocumentProcessingRequest>>,
) -> Result<HttpResponse, WebhookError> {
params
.handle_request(
status_tags,
api_client,
config,
document_pipeline,
ProcessingType::CustomFieldPrediction,
)
.await?;
Ok(HttpResponse::Accepted().into())
}
#[derive(utoipa::OpenApi)]
#[openapi(
paths(suggest_correspondent, custom_field_prediction, decision),
components(schemas(WebhookParams))
)]
pub(crate) struct DocumentProcessingApiSpec;
struct DocumentProcessingApi;
impl HttpServiceFactory for DocumentProcessingApi {
fn register(self, config: &mut actix_web::dev::AppService) {
custom_field_prediction.register(config);
suggest_correspondent.register(config);
decision.register(config);
}
}
/// given an updated document, add changes from the processing type to other queued instances of the document
///
/// the purpose of this function is the following situation: a document may be present multiple times in the processing queue,
/// and since later document versions potentially carry more information, this function updates later versions of the
/// document with data from previously run processing steps without losing any additional data. The goal is to minimize the amount
/// of back communication with the paperless server, avoiding updating documents multiple times and triggering workflows unnecessarily
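///
/// For example: if a custom field prediction has just filled fields on a document that is still queued
/// for a later decision flow, the queued copy receives the filled fields so the final sync writes both results at once.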
fn merge_document_status(
doc: &mut Document,
updated_doc: &Document,
processing_type: &ProcessingType,
) {
if doc.id != updated_doc.id {
// if document ids do not match stop here!
return;
}
match processing_type {
ProcessingType::CustomFieldPrediction => {
if let Some(updated_custom_fields) = &updated_doc.custom_fields {
for updated_cf in updated_custom_fields {
if let Some(doc_custom_fields) = doc.custom_fields.as_mut() {
let mut cf_found = false;
for doc_cf_i in &mut *doc_custom_fields {
if doc_cf_i.field == updated_cf.field {
cf_found = true;
doc_cf_i.value = updated_cf.value.clone();
}
}
if !cf_found {
doc_custom_fields.push(updated_cf.clone());
}
}
}
}
}
ProcessingType::CorrespondentSuggest => doc.correspondent = updated_doc.correspondent,
ProcessingType::DecisionTagFlow { .. } => {
// this processing type adds tags to the document
for updated_tag in &updated_doc.tags {
if !doc.tags.contains(updated_tag) {
doc.tags.push(*updated_tag);
}
}
}
}
}
/// this job's function is to receive processed documents and send the results back to the paperless instance
///
/// the goal is to minimize traffic to the paperless instance and avoid waiting for api requests
async fn document_updater(
status_tags: PaperlessStatusTags,
mut api_client: Client,
mut document_update_channel: tokio::sync::mpsc::UnboundedReceiver<
Result<
(DocumentProcessingRequest, bool),
(DocumentProcessingError, DocumentProcessingRequest, bool),
>,
>,
) {
let mut deferred_doc_updates: BTreeMap<i64, Vec<ProcessingType>> = BTreeMap::new();
while let Some(doc_update) = document_update_channel.recv().await {
let same_doc_in_queue_again;
let doc_req = match doc_update {
Ok((doc_req, queued_again)) => {
same_doc_in_queue_again = queued_again;
doc_req
}
Err((err, doc_req, queued_again)) => {
// surface processing errors instead of silently discarding them
log::error!(
"Processing of document {} failed: {err}",
doc_req.document.id
);
same_doc_in_queue_again = queued_again;
doc_req
}
};
// if there are no further processing steps pending for the document then its
// state can be synced back to the paperless server: all processing tags are removed
// and the finished / next tag is set
if !same_doc_in_queue_again {
let updated_doc_tags: Vec<i64> = doc_req
.document
.tags
.iter()
.copied()
.filter(|t| *t != status_tags.processing.id)
.chain(std::iter::once(
doc_req
.overwrite_finished_tag
.as_ref()
.map_or(status_tags.finished.id, |t| t.id),
))
.unique()
.collect();
let mut updated_cf: Option<Vec<CustomFieldInstance>> = None;
let mut updated_crrspdnt: Option<i64> = None;
for doc_processing_steps in [doc_req.processing_type]
.iter()
.chain(
deferred_doc_updates
.get(&doc_req.document.id)
.unwrap_or(&Vec::new()),
)
.unique()
{
match doc_processing_steps {
ProcessingType::CustomFieldPrediction => {
if let Some(cfis) = doc_req.document.custom_fields.as_ref() {
updated_cf = Some(cfis.clone());
}
}
ProcessingType::CorrespondentSuggest => {
updated_crrspdnt = doc_req.document.correspondent;
}
ProcessingType::DecisionTagFlow { .. } => {
// nothing needs to happen here, the updated tags are already part of the document
// since they are synced with the same document in the queue
}
}
}
let _ = requests::processed_doc_update(
&mut api_client,
doc_req.document.id,
updated_doc_tags,
updated_crrspdnt,
updated_cf,
)
.await
.map_err(|err| {
log::error!("{err}");
err
});
} else {
// remember how the document has been processed so far for the deferred update later
let deferred_steps = deferred_doc_updates.entry(doc_req.document.id).or_default();
if !deferred_steps.contains(&doc_req.processing_type) {
deferred_steps.push(doc_req.processing_type);
}
}
}
}
// future performance optimization needs to focus on this function: it should dispatch to batch processing of documents
// or could combine requests to the same document in the queue.
/// this job's function is to batch process documents using the llm and send the results on to the update handler
async fn document_processor(
config: Config,
mut api_client: Client,
document_update_channel: tokio::sync::mpsc::UnboundedSender<
Result<
(DocumentProcessingRequest, bool),
(DocumentProcessingError, DocumentProcessingRequest, bool),
>,
>,
) {
while !*STOP_FLAG.read().await {
while !PROCESSING_QUEUE.read().await.is_empty() {
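// lazily load the model on first demand; it is unloaded again once the queue drains (see the end of the outer loop)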
let model_path = config.model.clone();
{
let mut model_singleton = MODEL_SINGLETON.lock().await;
if model_singleton.is_none() {
let max_ctx = if config.max_ctx == 0 { None } else { Some(config.max_ctx as u32) };
*model_singleton = spawn_blocking(move || {
LLModelExtractor::new(Path::new(&model_path), config.num_gpu_layers, max_ctx)
})
.await
.map_err(|err| {
log::error!("Error loading Model! {err}");
ModelError::ModelNotLoaded
})
.and_then(|r| r)
.ok();
}
}
let mut doc_process_req = {
// nesting here to ensure write lock is dropped while processing the document in the next step
PROCESSING_QUEUE
.write()
.await
.pop_front()
.expect("Size is greater 0 so there must be a document in the queue")
};
let processing_result = match doc_process_req.processing_type {
ProcessingType::CustomFieldPrediction => {
handle_custom_field_prediction(&mut doc_process_req.document, &mut api_client)
.await
}
ProcessingType::CorrespondentSuggest => {
handle_correspondent_suggest(&mut doc_process_req.document, &mut api_client)
.await
}
ProcessingType::DecisionTagFlow {
ref question,
ref true_tag,
ref false_tag,
} => {
handle_decision(
&mut doc_process_req.document,
question,
true_tag.as_ref(),
false_tag.as_ref(),
)
.await
}
};
let mut doc_in_queue_again = false;
match processing_result {
Ok(_) => {
// if the same document has more processing requests pending update the doc state to
// also contain the newly added data.
for next_process_req in PROCESSING_QUEUE.write().await.iter_mut() {
if next_process_req.document.id == doc_process_req.document.id {
doc_in_queue_again = true;
merge_document_status(
&mut next_process_req.document,
&doc_process_req.document,
&doc_process_req.processing_type,
);
}
}
let _ = document_update_channel.send(Ok((doc_process_req, doc_in_queue_again)));
}
Err(err) => {
doc_in_queue_again = PROCESSING_QUEUE
.read()
.await
.iter()
.map(|doc_req| doc_req.document.id)
.contains(&doc_process_req.document.id);
let _ = document_update_channel.send(Err((
err,
doc_process_req,
doc_in_queue_again,
)));
}
}
}
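// queue drained: idle briefly before re-checking for new work or a shutdown request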
tokio::time::sleep(Duration::from_secs(2)).await;
if PROCESSING_QUEUE.read().await.is_empty() {
let mut model_singleton = MODEL_SINGLETON.lock().await;
if model_singleton.take().is_some() {
// no documents need processing anymore, drop the model to free its resources
log::info!("Unloading Model due to processing queue being empty!");
}
}
}
}
/// this function is just here to receive documents for processing from the different api endpoints
/// all documents received via the channel are appended to a queue of documents that need processing
/// the reason for this is that this way the document processor can inspect the state of the document queue and
/// make smart decisions on how to process the documents for maximum efficiency
async fn document_request_funnel(
mut processing_request_channel: tokio::sync::mpsc::UnboundedReceiver<DocumentProcessingRequest>,
) {
while let Some(prc_req) = processing_request_channel.recv().await {
log::debug!("Received Request for Document {:#?}", prc_req.document.id);
let mut doc_queue = PROCESSING_QUEUE.write().await;
doc_queue.push_back(prc_req);
}
*STOP_FLAG.write().await = true;
}
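/// status tags used to mark a document's processing state on the paperless side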
#[derive(Debug, Clone)]
struct PaperlessStatusTags {
processing: Tag,
finished: Tag,
}
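/// spin up the webhook server together with the background tasks that funnel,
/// process and sync documents back to the paperless instance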
pub async fn run_server(
config: Config,
processing_tag: Tag,
finished_tag: Tag,
paperless_api_client: Client,
) -> Result<(), std::io::Error> {
let (tx, rx) = tokio::sync::mpsc::unbounded_channel::<DocumentProcessingRequest>();
let (tx_update, rx_update) = tokio::sync::mpsc::unbounded_channel();
let status_tags = PaperlessStatusTags {
processing: processing_tag,
finished: finished_tag,
};
let doc_to_process_queue = spawn(document_request_funnel(rx));
let doc_processor = spawn(document_processor(
config.clone(),
paperless_api_client.clone(),
tx_update,
));
let doc_update_task = spawn(document_updater(
status_tags.clone(),
paperless_api_client.clone(),
rx_update,
));
let server_config = config.clone();
let webhook_server = HttpServer::new(move || {
App::new()
.app_data(Data::new(tx.clone()))
.app_data(Data::new(server_config.clone()))
.app_data(Data::new(paperless_api_client.clone()))
.app_data(Data::new(status_tags.clone()))
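// serve the interactive Swagger UI under /api/ with the generated OpenAPI spec at /docs/openapi.json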
.service(
SwaggerUi::new("/api/{_:.*}")
.config(utoipa_swagger_ui::Config::default().use_base_layout())
.url("/docs/openapi.json", DocumentProcessingApiSpec::openapi()),
)
.service(DocumentProcessingApi)
})
.bind((config.host, config.port))?
.run();
let _ = join!(
webhook_server,
doc_to_process_queue,
doc_processor,
doc_update_task
);
Ok(())
}